This notebook is the first pass at cleaning the aggregated g2f data. It will draw on code written for past projects, specifically
maizemodelandg2fd. I will also use random forests to help identify which targets are the highest value for data cleaning.
# ! mamba install pandas -y
# ! mamba install scikit-learn -y
# ! mamba install matplotlib -y
# ! mamba install plotly -y
# ! conda install openpyxl -y
# ! pip install tqdm
# ! conda install -c conda-forge optuna -y
# !conda install -c conda-forge py-xgboost-gpu -y
import os, json, requests # for downloading power data with `dl_power_data()`
import glob
import re
import time # add a delay between requests to NASA POWER API for weather data
import pickle as pkl
import numpy as np
from numpy import random
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.impute import KNNImputer # for imputing soil for ARH1_2016 & ARH2_2016
from sklearn import preprocessing # LabelEncoder
from sklearn.metrics import mean_squared_error # if squared=False; RMSE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import plotly.express as px
import tqdm
import optuna
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
# from g2f_comp.internal import *
Using cpu device
# Output location for artifacts produced by this notebook, and the
# competition's train/test data directories.
cache_path = './notebook_artifacts/0_datacleaning/'
train_dir= "./data/Maize_GxE_Competition_Data/Training_Data/"
test_dir = "./data/Maize_GxE_Competition_Data/Testing_Data/"
# Handy one liners for finding mismatched data types
# find disagreeing types
# [e for e in [e for e in list(x_test) if e in list(x_train)] if (np.dtype(x_train[e]) != np.dtype(x_test[e])) ]
# [(e, np.dtype(x_train[e]), np.dtype(x_test[e])) for e in [e for e in list(x_test) if e in list(x_train)] if (np.dtype(x_train[e]) != np.dtype(x_test[e])) ]
# Aggregate train/test into a single set of dfs
# For each data source: read the train and test files, coerce test columns
# whose inferred dtypes disagree with train (presumably due to sparsity in the
# 2022 files -- confirm if re-run), then outer-merge so the 2022 rows are
# appended onto the 2014-2021 rows.
# phenotype/trait
x_train = pd.read_csv(train_dir+'1_Training_Trait_Data_2014_2021.csv')
x_test = pd.read_csv(test_dir+'1_Submission_Template_2022.csv')
# use env to fill in other keys
# x_test[["Field_Location", "Year"]] = x_test['Env'].str.split('_',expand=True)
# x_test["Year"] = x_test["Year"].astype(int)
phno = x_train.merge(x_test, "outer")
# metadata
# I've manually resaved this csv as an xlsx to get around this error with the csv:
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xca in position 5254: invalid continuation byte
x_train = pd.read_excel(train_dir+'2_Training_Meta_Data_2014_2021.xlsx')
x_test = pd.read_csv(test_dir+'2_Testing_Meta_Data_2022.csv')
x_test['Date_weather_station_placed'] = pd.to_datetime(x_test.Date_weather_station_placed)
x_test['Date_weather_station_removed'] = pd.to_datetime(x_test.Date_weather_station_removed)
# to string
for e in ['Weather_Station_Serial_Number (Last four digits, e.g. m2700s#####)',
          'Issue/comment_#5',
          'Issue/comment_#6',
          'Comments']:
    x_test[e] = x_test[e].astype(str)
meta = x_train.merge(x_test, how = "outer")
# soil
x_train = pd.read_csv(train_dir+'3_Training_Soil_Data_2015_2021.csv')
x_test = pd.read_csv(test_dir+'3_Testing_Soil_Data_2022.csv')
# cast to float so these columns match the training dtypes before the merge
for e in ['E Depth',
          'lbs N/A',
          'Potassium ppm K',
          'Calcium ppm Ca',
          'Magnesium ppm Mg',
          'Sodium ppm Na',
          '%H Sat',
          '%K Sat',
          '%Ca Sat',
          '%Mg Sat',
          '%Na Sat',
          'Mehlich P-III ppm P']:
    x_test[e] = x_test[e].astype(float)
x_test['Comments'] = x_test['Comments'].astype(str)
soil = x_train.merge(x_test, how = "outer")
# weather
x_train = pd.read_csv(train_dir+'4_Training_Weather_Data_2014_2021.csv')
x_test = pd.read_csv(test_dir+'4_Testing_Weather_Data_2022.csv')
wthr = x_train.merge(x_test, how = "outer")
# genomic
# TODO
# geno
# x_train = pd.read_csv(train_dir+'All_hybrid_names_info.csv')
# "./data/Maize_GxE_Competition_Data/Training_Data"
# '5_Genotype_Data_All_Years.vcf.zip',
# '6_Training_EC_Data_2014_2021.csv',
# 'All_hybrid_names_info.csv',
# 'GenoDataSources.txt'
# crop growth model variables (enviromental covariates)
x_train = pd.read_csv(train_dir+'6_Training_EC_Data_2014_2021.csv')
x_test = pd.read_csv(test_dir+'6_Testing_EC_Data_2022.csv')
cgmv = x_train.merge(x_test, how = "outer")
# from g2fd
# from g2fd
def find_shared_cols(df1, df2):
    """Return the column names of `df1` that also appear in `df2`.

    Order follows `df1`'s columns.
    """
    cols2 = set(df2)  # iterating a DataFrame yields its column labels
    return [col for col in df1 if col in cols2]
# Changes involving only 1 data frame =======================================
# phno
# object to datetime
phno.Date_Planted = pd.to_datetime(phno.Date_Planted)
phno.Date_Harvested = pd.to_datetime(phno.Date_Harvested)
# Check to make sure there are no missing values for these
# (every EC column must be fully observed before it is used downstream)
assert [] == [e for e in list(cgmv) if 0 != np.mean(cgmv[e].isna())]
# Changes involving _2_ data frames =========================================
# phno and meta share 'Plot_Area_ha', 'Date_Planted' these need to be checked for consistency and moved.
# use meta to fill in pheno then drop
# meta has the information from 22 and nothing else. Reverse for phno
for e in ['Plot_Area_ha', 'Date_Planted']:
    mask = phno[e].isna()
    # unique (Env, Year) keys in phno that are missing column `e`
    fill_ins = phno.loc[mask, ['Env', 'Year']].drop_duplicates().reset_index()
    # for all the unique key combinations find the value that should be inserted and do so.
    for i in range(fill_ins.shape[0]):
        phno_mask = ((phno.Env == fill_ins.loc[i,"Env"]) &
                     (phno.Year == fill_ins.loc[i,"Year"]) )
        meta_mask = ((meta.Env == fill_ins.loc[i,"Env"]) &
                     (meta.Year == fill_ins.loc[i,"Year"]) )
        insert_val = list(meta.loc[meta_mask, e])
        # get rid of nans (nan != nan); the comprehension's `e` is scoped to
        # the comprehension and does not clobber the outer column name `e`
        insert_val = [e for e in insert_val if e == e]
        if insert_val == []:
            pass  # meta has no value either; leave phno missing
        else:
            assert len(insert_val) == 1 # check that there's only one value to impute
            phno.loc[phno_mask, e] = insert_val[0]
meta = meta.drop(columns=['Plot_Area_ha', 'Date_Planted'])
#from g2fd
# generalized version of `sanitize_Experiment_Codes`
def sanitize_col(df, col, simple_renames=None, split_renames=None):
    """Standardize the values of `df[col]`.

    Generalized version of `sanitize_Experiment_Codes`.

    Parameters
    ----------
    df : pd.DataFrame
        Data to clean. The input is no longer mutated; a cleaned copy is
        returned (previously the simple renames wrote into the caller's frame
        while the splits operated on a reassigned copy -- inconsistent).
    col : str
        Name of the column to rename/split values in.
    simple_renames : dict, optional
        {old_value: new_value} one-to-one replacements. Defaults were mutable
        `{}` literals; replaced with the None-sentinel idiom.
    split_renames : dict, optional
        {old_value: [new_value, ...]} one-to-many replacements: each matching
        row is removed and re-appended once per replacement value.

    Returns
    -------
    pd.DataFrame
    """
    simple_renames = {} if simple_renames is None else simple_renames
    split_renames = {} if split_renames is None else split_renames
    df = df.copy()  # never mutate the caller's frame
    # simple renames
    for old, new in simple_renames.items():
        df.loc[df[col] == old, col] = new
    # splits
    # pull out the relevant multiname rows, copy, rename, append
    for old, replacements in split_renames.items():
        mask = (df[col] == old)
        temp = df.loc[mask, :]
        df = df.loc[~mask, :]
        for new in replacements:
            temp2 = temp.copy()
            temp2[col] = new
            df = df.merge(temp2, how = 'outer')
    return(df)
def summarize_col_missing(df):
    """Summarize missingness per column of `df`.

    Returns a DataFrame with one row per column: 'Col' (name), 'N_miss'
    (count of NA values), and 'Pr_Comp' (percent complete, 1 decimal).
    """
    names, miss_counts, pct_complete = [], [], []
    for col in df:
        n_missing = sum(df[col].isna())
        names.append(col)
        miss_counts.append(n_missing)
        pct_complete.append(round(100 * (1 - n_missing / len(df[col])), 1))
    return pd.DataFrame({'Col': names,
                         'N_miss': miss_counts,
                         'Pr_Comp': pct_complete})
# summarize_col_missing(df = meta)
# Environments excluded from the analysis entirely.
rm_Envs = ['GEH1_2020', 'GEH1_2021', 'GEH1_2019' # Germany locations
           # 'ARH1_2016', 'ARH2_2016' # Only in 2016, missing soil data
           ]
# Filter the excluded Envs out of every table.
phno = phno.loc[~phno.Env.isin(rm_Envs), :]
meta = meta.loc[~meta.Env.isin(rm_Envs), :]
soil = soil.loc[~soil.Env.isin(rm_Envs), :]
wthr = wthr.loc[~wthr.Env.isin(rm_Envs), :]
cgmv = cgmv.loc[~cgmv.Env.isin(rm_Envs), :]
# Fix types: object to datetime
# NOTE(review): repeats the conversion already done above; harmless (idempotent).
phno.Date_Planted = pd.to_datetime(phno.Date_Planted)
phno.Date_Harvested = pd.to_datetime(phno.Date_Harvested)
# FIXME this may have been vestigial, everthing got shuffled by by jupytext.
# # phno and meta share 'Plot_Area_ha', 'Date_Planted' these need to be checked for consistency and moved.
# # use meta to fill in pheno then drop
# # meta has the information from 22 and nothing else. Reverse for phno
# for e in ['Plot_Area_ha', 'Date_Planted']:
#     mask = phno[e].isna()
#     fill_ins = phno.loc[mask, ['Env', 'Year']].drop_duplicates().reset_index()
#     # for all the unique key combinations find the value that should be inserted and do so.
#     for i in range(fill_ins.shape[0]):
#         phno_mask = ((phno.Env == fill_ins.loc[i,"Env"]) &
#                      (phno.Year == fill_ins.loc[i,"Year"]) )
#         meta_mask = ((meta.Env == fill_ins.loc[i,"Env"]) &
#                      (meta.Year == fill_ins.loc[i,"Year"]) )
#         insert_val = list(meta.loc[meta_mask, e])
#         insert_val = [e for e in insert_val if e == e] # get rid of nans
#         if insert_val == []:
#             pass
#         else:
#             assert len(insert_val) == 1 # check that there's only one value to imput
#             phno.loc[phno_mask, e] = insert_val[0]
# # meta = meta.drop(columns=['Plot_Area_ha', 'Date_Planted'])
# meta = meta.drop(columns=[e for e in list(meta) if e in ['Plot_Area_ha', 'Date_Planted']])
# drop and redo year and Env
phno = phno.drop(columns = ['Year'])
meta = meta.drop(columns = ['Year'])
soil = soil.drop(columns = ['Year'])
# Build an Env -> sanitized Env2 lookup, then derive Year from the sanitized name.
temp = pd.DataFrame(phno['Env']).drop_duplicates().reset_index().drop(columns = 'index')
temp['Env2'] = temp['Env']
# MOH1 used '_' inside the location name; switch to '-' so '_' only separates
# location from year.
temp = sanitize_col(
    df = temp,
    col = 'Env2',
    simple_renames= {
        'MOH1_1_2018': 'MOH1-1_2018',
        'MOH1_2_2018': 'MOH1-2_2018',
        'MOH1_1_2020': 'MOH1-1_2020',
        'MOH1_2_2020': 'MOH1-2_2020'
    },
    split_renames= {})
# every sanitized Env must now be exactly "<Location>_<Year>"
assert [] == [e for e in list(temp['Env2']) if len(e.split('_')) != 2]
temp[["Experiment_Code", "Year"]] = temp['Env2'].str.split('_',expand=True)
temp = temp.drop(columns = ['Experiment_Code'])
# Swap the sanitized Env (and recomputed Year) into every table.
phno = temp.merge(phno).drop(columns = ['Env']).rename(columns = {'Env2':'Env'})
meta = temp.merge(meta).drop(columns = ['Env']).rename(columns = {'Env2':'Env'})
soil = temp.merge(soil).drop(columns = ['Env']).rename(columns = {'Env2':'Env'})
wthr = temp.merge(wthr).drop(columns = ['Env']).rename(columns = {'Env2':'Env'})
cgmv = temp.merge(cgmv).drop(columns = ['Env']).rename(columns = {'Env2':'Env'})
# display: only the key columns should be shared between phno and the others
(find_shared_cols(phno, meta),
 find_shared_cols(phno, soil),
 find_shared_cols(phno, wthr),
 find_shared_cols(phno, cgmv))
(['Env', 'Year'], ['Env', 'Year'], ['Env', 'Year'], ['Env', 'Year'])
Confirm that every environment is present in each table and has GPS coordinates.
# Unique Envs present in each table.
phno_Envs = list(set(phno['Env']))
meta_Envs = list(set(meta['Env']))
soil_Envs = list(set(soil['Env']))
wthr_Envs = list(set(wthr['Env']))
cgmv_Envs = list(set(cgmv['Env']))
# Union of Envs across tables.
# NOTE(review): meta_Envs is not included in this union -- confirm intentional.
all__Envs = list(set(phno_Envs+soil_Envs+wthr_Envs+cgmv_Envs))
def get_missing_envs(data_Envs=()):
    """Return the Envs in the module-level `all__Envs` that are absent from
    `data_Envs`, in `all__Envs` order.

    The default was a mutable list literal (`[]`); replaced with an immutable
    empty tuple -- identical behavior, avoids the shared-mutable-default
    pitfall. Membership is tested against a set for O(1) lookups.
    """
    present = set(data_Envs)
    return [e for e in all__Envs if e not in present]
# Envs absent from each table (relative to the union `all__Envs`).
phno_Envs_miss = get_missing_envs(data_Envs = phno_Envs)
meta_Envs_miss = get_missing_envs(data_Envs = meta_Envs)
soil_Envs_miss = get_missing_envs(data_Envs = soil_Envs)
wthr_Envs_miss = get_missing_envs(data_Envs = wthr_Envs)
cgmv_Envs_miss = get_missing_envs(data_Envs = cgmv_Envs)
# Write out to-be-imputed envs / wholly absent envs
if [] != phno_Envs_miss:
    pd.DataFrame({'Absent_Envs':phno_Envs_miss}).to_csv('./data/Preparation/phno_Envs_miss.csv', index=False)
if [] != meta_Envs_miss:
    pd.DataFrame({'Absent_Envs':meta_Envs_miss}).to_csv('./data/Preparation/meta_Envs_miss.csv', index=False)
if [] != soil_Envs_miss:
    pd.DataFrame({'Absent_Envs':soil_Envs_miss}).to_csv('./data/Preparation/soil_Envs_miss.csv', index=False)
if [] != wthr_Envs_miss:
    pd.DataFrame({'Absent_Envs':wthr_Envs_miss}).to_csv('./data/Preparation/wthr_Envs_miss.csv', index=False)
if [] != cgmv_Envs_miss:
    pd.DataFrame({'Absent_Envs':cgmv_Envs_miss}).to_csv('./data/Preparation/cgmv_Envs_miss.csv', index=False)
# phno and meta must already cover every Env; the rest get placeholder rows.
assert [] == phno_Envs_miss
assert [] == meta_Envs_miss
# soil: add empty (all-NA) rows for Envs lacking soil data, to be imputed later
add_envs = phno.loc[(phno.Env.isin(soil_Envs_miss)), ['Env', 'Year']].drop_duplicates()
soil = soil.merge(add_envs, how = 'outer')
soil_Envs_miss = get_missing_envs(data_Envs = list(set(soil['Env'])))
assert [] == soil_Envs_miss
# wthr: cross the missing Envs with each Date observed in their Year so every
# added Env gets one placeholder row per date
add_envs = phno.loc[(phno.Env.isin(wthr_Envs_miss)), ['Env', 'Year']].drop_duplicates()
add_envs = add_envs.merge(
    wthr.loc[(wthr.Year.isin(list(set(add_envs['Year'])))), ['Year', 'Date']].drop_duplicates(),
    how = 'outer'
)
wthr = wthr.merge(add_envs, how = 'outer')
wthr_Envs_miss = get_missing_envs(data_Envs = list(set(wthr['Env'])))
assert [] == wthr_Envs_miss
# cgmv: same placeholder treatment as soil
add_envs = phno.loc[(phno.Env.isin(cgmv_Envs_miss)), ['Env', 'Year']].drop_duplicates()
cgmv = cgmv.merge(add_envs, how = 'outer')
cgmv_Envs_miss = get_missing_envs(data_Envs = list(set(cgmv['Env'])))
assert [] == cgmv_Envs_miss
# TODO add in hybrid processing here
# can use
# 'Hybrid',
# 'Hybrid_orig_name',
# 'Hybrid_Parent1',
# 'Hybrid_Parent2'
# regroup phenotype and meta Columns
# Join phno and meta, then re-split into three narrower tables:
# phno (prediction targets), meta (covariates), cmnt (free-text issue fields).
df = phno.merge(meta)
cols_for_phno = [
    'Env',
    'Year',
    'Hybrid',
    'Yield_Mg_ha',          #| <- Key target
    'Stand_Count_plants',   #|- Possible intermediate prediction targets
    'Pollen_DAP_days',      #|
    'Silk_DAP_days',        #|
    'Plant_Height_cm',      #|
    'Ear_Height_cm',        #|
    'Root_Lodging_plants',  #|
    'Stalk_Lodging_plants', #|
    'Grain_Moisture',       #|
    'Twt_kg_m3']            #|
cols_for_meta = [
    'Env',
    'Year',
    # 'Field_Location',
    # 'Experiment',
    # 'Replicate',
    # 'Block',
    # 'Plot',
    # 'Range',
    # 'Pass',
    'Hybrid',
    # 'Hybrid_orig_name',
    # 'Hybrid_Parent1',
    # 'Hybrid_Parent2',
    'Plot_Area_ha',   #|- Needs Imputation
    'Date_Planted',   #|
    'Date_Harvested', #|
    'Experiment_Code',
    'Treatment',
    'City',
    'Farm',
    'Field',
    'Trial_ID (Assigned by collaborator for internal reference)', #
    'Soil_Taxonomic_ID and horizon description, if known', # 32.2% complete
    'Weather_Station_Serial_Number (Last four digits, e.g. m2700s#####)',
    'Weather_Station_Latitude (in decimal numbers NOT DMS)',
    'Weather_Station_Longitude (in decimal numbers NOT DMS)',
    # 'Date_weather_station_placed',
    # 'Date_weather_station_removed',
    'Previous_Crop',
    'Pre-plant_tillage_method(s)',
    'In-season_tillage_method(s)',
    'Type_of_planter (fluted cone; belt cone; air planter)',
    'System_Determining_Moisture',
    'Pounds_Needed_Soil_Moisture',
    'Latitude_of_Field_Corner_#1 (lower left)',
    'Longitude_of_Field_Corner_#1 (lower left)',
    'Latitude_of_Field_Corner_#2 (lower right)',
    'Longitude_of_Field_Corner_#2 (lower right)',
    'Latitude_of_Field_Corner_#3 (upper right)',
    'Longitude_of_Field_Corner_#3 (upper right)',
    'Latitude_of_Field_Corner_#4 (upper left)',
    'Longitude_of_Field_Corner_#4 (upper left)',
    'Cardinal_Heading_Pass_1',
    'Irrigated']
cols_for_cmnt =[
    'Env',
    'Year',
    'Issue/comment_#1',
    'Issue/comment_#2',
    'Issue/comment_#3',
    'Issue/comment_#4',
    'Issue/comment_#5',
    'Issue/comment_#6',
    'Comments']
phno = df.loc[:, cols_for_phno]
meta = df.loc[:, cols_for_meta]
cmnt = df.loc[:, cols_for_cmnt]
# Deal with Issues/Comments
# I think I'll ignore these fields. There's qualitative information we can get
# and additional weather data (below) but not a lot that will be immediately
# useful
cmnt = cmnt.drop_duplicates()
# long format: one row per (Env, Year, comment field, value)
cmnt = cmnt.melt(id_vars = ['Env', 'Year'])
cmnt = cmnt.loc[(cmnt.value.notna()), :]
# drop the literal 'nan' strings produced by the earlier astype(str) casts
cmnt = cmnt.loc[(cmnt.value != 'nan'), :]
cmnts_2022 = list(set(list(cmnt.loc[cmnt.Year == "2022", 'value'])))
# cmnts_2022
# looking at comments from 2022 we could make use of some of these if we were
# making manual guesses but these won't be too helpful here.
# - 'Some very low yields caused by greensnap\n',
# - 'Deer damage on the early hybrids',
# - 'sandhill cranes pulled out seedlings in sections of the field in late May/early June',
# - 'Thunderstorm caused very high level of root lodging and some greensnap. Corn was upright at end of season, but mostly goosenecked.\nRoot lodging not recroded becuase of high percentage of goosenecking\nGreensnap not counted separately, so stalk lodging includes both goosenecking and stalk lodging\nSome very low yields caused by greensnap\n',
# However, the additional weather may be of use.
# TODO, get additional weather data
# 'Link to additional weather source available online: http://www.deos.udel.edu',
# 'Link to additional weather source available online: https://weather.cfaes.osu.edu//stationinfo.asp?id=13',
# 'Link to additional weather source available online: Georgia Weather - Automated Environmental Monitoring Network Page (uga.edu) - http://weather.uga.edu/mindex.php?content=calculator&variable=CC&site=TIFTON',
# 'Link to additional weather source available online: https://newa.cornell.edu/all-weather-data-query/ (select Aurora (CUAES Musgrave), NY)',
# 'Link to additional weather source available online: https://www.isws.illinois.edu/warm/stationmeta.asp?site=CMI&from=wx'
# more from pre 2022
# 'http://newa.cornell.edu/index.php?page=weather-station-page&WeatherStation=aur',
# 'Additional weather source available online: http://www.georgiaweather.net/?content=calculator&variable=CC&site=PENFIELD',
# 'http://www.deos.udel.edu',
# 'Additional weather data source: http://www.deos.udel.edu',
# ' https://soilseries.sc.egov.usda.gov/OSD_Docs/M/MEXICO.html',
# 'Link to additional weahter source: http://www.deos.udel.edu',
# 'Link to additional weahter source: Georgia Weather - Automated Environmental Monitoring Network Page (uga.edu) http://weather.uga.edu/mindex.php?variable=HI&site=TIFTON',
# 'Additional weather data for the farm can be found at: http://newa.cornell.edu/index.php?page=weather-station-page&WeatherStation=aur',
# 'Addicional weather source available online: http://agebb.missouri.edu/weather/history/index.asp?station_prefix=bfd',
# 'http://weather.uga.edu/mindex.php?content=calculator&variable=CC&site=TIFTON',
# [e for e in list(set(list(cmnt.loc[cmnt.Year != "2022", 'value']))) if e not in cmnts_2022]
Fix Missing GPS Coordinates.
# Fix GPS coordinates
gps_cols = ['Latitude_of_Field_Corner_#1 (lower left)',
            'Latitude_of_Field_Corner_#2 (lower right)',
            'Latitude_of_Field_Corner_#3 (upper right)',
            'Latitude_of_Field_Corner_#4 (upper left)',
            'Longitude_of_Field_Corner_#1 (lower left)',
            'Longitude_of_Field_Corner_#2 (lower right)',
            'Longitude_of_Field_Corner_#3 (upper right)',
            'Longitude_of_Field_Corner_#4 (upper left)',
            'Weather_Station_Latitude (in decimal numbers NOT DMS)',
            'Weather_Station_Longitude (in decimal numbers NOT DMS)']
gps = meta.loc[:, ['Env']+gps_cols]
# exclude sites outside of north america (side benefit of disqualifying wrongly input data from TXH1_2021)
# Logitude must be west of -60; latitude south of 45.
longitude_max = -60
latitude_max = 45
for col in gps_cols:
    print("Cleaning: '"+col+"'")
    if re.match('.*Latitude.+', col):
        mask = gps[col ]>latitude_max
    elif re.match('.*Longitude.+', col):
        mask = gps[col ]>longitude_max
    # NOTE(review): if a column matched neither pattern, `mask` would be stale
    # from the previous iteration; every current gps_col matches one of the two.
    print("Erasing values in: '"+"', '".join(list(set(list(gps.loc[mask, 'Env']))))+"'")
    gps.loc[mask, col] = np.nan
    # print('\n')
Cleaning: 'Latitude_of_Field_Corner_#1 (lower left)' Erasing values in: '' Cleaning: 'Latitude_of_Field_Corner_#2 (lower right)' Erasing values in: '' Cleaning: 'Latitude_of_Field_Corner_#3 (upper right)' Erasing values in: '' Cleaning: 'Latitude_of_Field_Corner_#4 (upper left)' Erasing values in: '' Cleaning: 'Longitude_of_Field_Corner_#1 (lower left)' Erasing values in: '' Cleaning: 'Longitude_of_Field_Corner_#2 (lower right)' Erasing values in: '' Cleaning: 'Longitude_of_Field_Corner_#3 (upper right)' Erasing values in: 'TXH1_2021' Cleaning: 'Longitude_of_Field_Corner_#4 (upper left)' Erasing values in: 'TXH1_2021' Cleaning: 'Weather_Station_Latitude (in decimal numbers NOT DMS)' Erasing values in: '' Cleaning: 'Weather_Station_Longitude (in decimal numbers NOT DMS)' Erasing values in: ''
# melt and get field center
gps = gps.melt(id_vars=['Env'])
# collapse the four corner latitudes into one 'Latitude_of_Field' variable
mask = gps.variable.isin([
    'Latitude_of_Field_Corner_#1 (lower left)',
    'Latitude_of_Field_Corner_#2 (lower right)',
    'Latitude_of_Field_Corner_#3 (upper right)',
    'Latitude_of_Field_Corner_#4 (upper left)'
])
gps.loc[mask, 'variable'] = 'Latitude_of_Field'
# ... and likewise for the longitudes
mask = gps.variable.isin([
    'Longitude_of_Field_Corner_#1 (lower left)',
    'Longitude_of_Field_Corner_#2 (lower right)',
    'Longitude_of_Field_Corner_#3 (upper right)',
    'Longitude_of_Field_Corner_#4 (upper left)'
])
gps.loc[mask, 'variable'] = 'Longitude_of_Field'
# collapse measures for the field
# NOTE(review): passing np.nanmean to .agg is deprecated in newer pandas;
# 'mean' (which skips NA by default) is the forward-compatible spelling.
gps = gps.groupby(['Env', 'variable']).agg(value = ('value', np.nanmean)).reset_index()
# if we have more accurate information, don't factor the weather station location into the estimate of the field location
gps['Replace'] = False
gps.loc[gps.value.isna(), 'Replace'] = True
# NOTE(review): Replace is per-row, so this masks every weather-station row
# that itself has a value -- regardless of whether the same Env's field
# coordinates are known. As written, station coordinates appear never to
# contribute to the average below; confirm this matches the intent.
mask = ((~(gps.Replace)) & (gps.variable == 'Weather_Station_Latitude (in decimal numbers NOT DMS)'))
gps.loc[mask, 'value'] = np.nan
mask = ((~(gps.Replace)) & (gps.variable == 'Weather_Station_Longitude (in decimal numbers NOT DMS)'))
gps.loc[mask, 'value'] = np.nan
gps = gps.drop(columns = ['Replace'])
# repeat the same trick to use the weather station info if the field info is not known
mask = gps.variable.isin([
    'Weather_Station_Latitude (in decimal numbers NOT DMS)',
    'Latitude_of_Field'
])
gps.loc[mask, 'variable'] = 'Latitude_of_Field'
mask = gps.variable.isin([
    'Weather_Station_Longitude (in decimal numbers NOT DMS)',
    'Longitude_of_Field'
])
gps.loc[mask, 'variable'] = 'Longitude_of_Field'
gps = gps.groupby(['Env', 'variable']).agg(value = ('value', np.nanmean)).reset_index()
# wide format: one row per Env with Latitude_of_Field / Longitude_of_Field
gps = gps.pivot(index='Env', columns='variable', values='value').reset_index()
# Some envs have no gps info, impute those based on similarly named locations
# There is only one TXH4 group (TXH4_2019), so the below approach doesn't work.
# I'll use all the TXH* sites to guess the coordinates.
mask = [True if re.match("TXH.+", e) else False for e in gps['Env']]
gps.loc[(gps.Env == 'TXH4_2019'), 'Latitude_of_Field'] = np.nanmedian(gps.loc[mask, 'Latitude_of_Field'])
gps.loc[(gps.Env == 'TXH4_2019'), 'Longitude_of_Field'] = np.nanmedian(gps.loc[mask, 'Longitude_of_Field'])
# Impute all non-TXH4 locations
gps[['EnvBase', 'EnvYear']] = gps.Env.str.split("_", expand = True)
mask_no_lat = gps.Latitude_of_Field.isna()
mask_no_lon = gps.Longitude_of_Field.isna()
impute_gps_vals = list(gps.loc[(mask_no_lat | mask_no_lon), 'Env'])
# for each Env with missing values, use the first portion of the name
# e.g. TXH1-Early_2017 -> TXH1 to search for possible matches
for impute_gps_val in impute_gps_vals:
    mask_gps_val = (gps.Env == impute_gps_val)
    # base name up to the first '-' (e.g. 'TXH1-Early' -> 'TXH1')
    match_root = gps.loc[mask_gps_val, 'EnvBase'].str.split('-')
    match_root = list(match_root)[0][0]
    # BUGFIX: this was `re.match(e, match_root+'.+')`, with re.match's
    # (pattern, string) arguments reversed -- only rows whose EnvBase exactly
    # equaled the root could match, defeating the prefix search described
    # above. Select every EnvBase that begins with the root instead.
    mask = [e.startswith(match_root) for e in gps['EnvBase']]
    # spread of the candidate coordinates (diagnostic; not otherwise used)
    check_std_lat = round(np.nanstd(gps.loc[mask, 'Latitude_of_Field']), 3)
    check_std_lon = round(np.nanstd(gps.loc[mask, 'Longitude_of_Field']),3)
    # impute with the median of the matching sites' coordinates
    gps.loc[mask_gps_val, 'Latitude_of_Field'] = np.nanmedian(gps.loc[mask, 'Latitude_of_Field'])
    gps.loc[mask_gps_val, 'Longitude_of_Field'] = np.nanmedian(gps.loc[mask, 'Longitude_of_Field'])
gps = gps.drop(columns = ['EnvBase', 'EnvYear'])
assert sum(gps.Latitude_of_Field.isna()) == 0
assert sum(gps.Longitude_of_Field.isna()) == 0
print("All GPS Coordinates Imputed!")
meta = meta.drop(columns = gps_cols).merge(gps)
print("And replaced in `meta`")
All GPS Coordinates Imputed! And replaced in `meta`
# set(phno.Hybrid)
# One row per plot observation, annotated with its field coordinates.
obs_summary = phno.loc[:, ['Env', 'Year', 'Hybrid', 'Yield_Mg_ha']].merge(meta.loc[:, ['Env', 'Latitude_of_Field', 'Longitude_of_Field']].drop_duplicates())
# max Euclidean distance (in raw lat/lon degrees) for two Envs to share a group
distance_threshold = 1
# Create clusters with lat/lon
temp = obs_summary.loc[:, ['Env', 'Latitude_of_Field', 'Longitude_of_Field']
    ].drop_duplicates(
    ).reset_index(
    ).drop(columns = 'index')
temp['GPS_Group'] = ""
temp['Distance'] = np.nan
gps_group_counter = 0
# Greedy clustering: each not-yet-grouped row seeds a group containing every
# row within the threshold.
# NOTE(review): rows already assigned to an earlier group can be reassigned
# if they also fall within a later seed's radius -- confirm acceptable.
for i in temp.index:
    match_lat = temp.loc[i, 'Latitude_of_Field']
    match_lon = temp.loc[i, 'Longitude_of_Field']
    if temp.loc[i, 'GPS_Group'] == '':
        temp['Distance'] = np.sqrt( (temp['Latitude_of_Field'] - float(match_lat))**2 + (temp['Longitude_of_Field'] - float(match_lon))**2 )
        temp.loc[(temp.Distance <= distance_threshold), 'GPS_Group'] = str(gps_group_counter)
        gps_group_counter += 1
obs_summary = obs_summary.merge(temp.loc[:, ['Env', 'Latitude_of_Field', 'Longitude_of_Field', 'GPS_Group']])
# groups represented in the 2022 (test) data, and masks used below
gps_groups_2022 = list(set(obs_summary.loc[(obs_summary.Year == '2022'), 'GPS_Group']))
mask_pre_2022 = (obs_summary.Year != '2022')
mask_gps_match = (obs_summary.GPS_Group.isin(gps_groups_2022))
def quick_obs_summary_tally(df=None):
    """Tally observations per Year.

    Parameters
    ----------
    df : pd.DataFrame, optional
        Two columns, 'Year' and 'Env' (one row per observation). Defaults to
        the module-level `obs_summary.loc[:, ['Year', 'Env']]`. The default
        used to be evaluated at definition time, freezing a snapshot of
        `obs_summary`; it is now resolved at call time.

    Returns
    -------
    pd.DataFrame
        One row per Year with 'Env' (count), 'Pct' (percent of total), and
        'Total' (overall count, repeated on every row).
    """
    if df is None:
        df = obs_summary.loc[:, ['Year', 'Env']]
    df = df.groupby(['Year']
        ).count(
        ).reset_index(
        ).assign(
            Pct = lambda dataframe: round(dataframe['Env']/np.sum(dataframe['Env']), 4)*100,
            Total = lambda dataframe: np.sum(dataframe['Env']))
    return(df)
# Baseline: the whole dataset
quick_obs_summary_tally(df = obs_summary.loc[:, ['Year', 'Env']])
| Year | Env | Pct | Total | |
|---|---|---|---|---|
| 0 | 2014 | 12675 | 8.68 | 146057 |
| 1 | 2015 | 13688 | 9.37 | 146057 |
| 2 | 2016 | 15387 | 10.53 | 146057 |
| 3 | 2017 | 15533 | 10.63 | 146057 |
| 4 | 2018 | 20851 | 14.28 | 146057 |
| 5 | 2019 | 20306 | 13.90 | 146057 |
| 6 | 2020 | 16440 | 11.26 | 146057 |
| 7 | 2021 | 19622 | 13.43 | 146057 |
| 8 | 2022 | 11555 | 7.91 | 146057 |
# The full possible testing dataset
obs_pre_2022 = quick_obs_summary_tally(df = obs_summary.loc[mask_pre_2022, ['Year', 'Env']])
obs_pre_2022
| Year | Env | Pct | Total | |
|---|---|---|---|---|
| 0 | 2014 | 12675 | 9.42 | 134502 |
| 1 | 2015 | 13688 | 10.18 | 134502 |
| 2 | 2016 | 15387 | 11.44 | 134502 |
| 3 | 2017 | 15533 | 11.55 | 134502 |
| 4 | 2018 | 20851 | 15.50 | 134502 |
| 5 | 2019 | 20306 | 15.10 | 134502 |
| 6 | 2020 | 16440 | 12.22 | 134502 |
| 7 | 2021 | 19622 | 14.59 | 134502 |
# The testing dataset constrained to gps groups in 2022
obs_pre_2022_gps = quick_obs_summary_tally(df = obs_summary.loc[(mask_pre_2022 & mask_gps_match), ['Year', 'Env']])
# change in total observation count vs the unconstrained pre-2022 set
obs_diff = obs_pre_2022_gps.loc[0, 'Total'] - obs_pre_2022.loc[0, 'Total']
obs_pct = 100*round(obs_diff/obs_pre_2022.loc[0, 'Total'], 4)
print(str(obs_diff)+' fewer obs\n'+str(obs_pct)+'% fewer obs')
# per-year difference in counts (rows align because both tallies share Years)
obs_pre_2022_gps['Diff'] = obs_pre_2022_gps['Env']
obs_pre_2022_gps['Diff'] = obs_pre_2022_gps['Diff'] - obs_pre_2022['Env']
obs_pre_2022_gps
-11320 fewer obs -8.42% fewer obs
| Year | Env | Pct | Total | Diff | |
|---|---|---|---|---|---|
| 0 | 2014 | 11643 | 9.45 | 123182 | -1032 |
| 1 | 2015 | 11219 | 9.11 | 123182 | -2469 |
| 2 | 2016 | 12717 | 10.32 | 123182 | -2670 |
| 3 | 2017 | 12646 | 10.27 | 123182 | -2887 |
| 4 | 2018 | 19177 | 15.57 | 123182 | -1674 |
| 5 | 2019 | 19806 | 16.08 | 123182 | -500 |
| 6 | 2020 | 16440 | 13.35 | 123182 | 0 |
| 7 | 2021 | 19534 | 15.86 | 123182 | -88 |
This doesn't remove too many observations from recent years but it's almost equivalent to a whole year. I think I'll not restrict the training set to only matching gps groups.
Because I'll be ensembling models, I need multiple testing sets for
Hyperparameter |-> Training |-> Ensemble |-> Predictions
Selection | | Tuning |
- 1 year | | |
import itertools # for making test set permutations
# All ordered triples of holdout years from 2014-2021: one year each for
# hyperparameter selection, model testing, and ensemble testing.
testing_years = pd.DataFrame(
    [e for e in itertools.permutations([2014+i for i in range(8)], 3)],
    columns = ['Test_HPS', 'Test_Model', 'Test_Ensemble'])
# fixed seed so the randomized ordering of triples is reproducible
rng = np.random.default_rng(2039476435238045723476)
testing_years['Random_Order'] = rng.permutation(testing_years.shape[0])
# display only -- sort_values returns a copy; testing_years stays unsorted
testing_years.sort_values('Random_Order')
| Test_HPS | Test_Model | Test_Ensemble | Random_Order | |
|---|---|---|---|---|
| 316 | 2021 | 2017 | 2019 | 0 |
| 28 | 2014 | 2019 | 2020 | 1 |
| 230 | 2019 | 2017 | 2016 | 2 |
| 162 | 2017 | 2021 | 2014 | 3 |
| 56 | 2015 | 2017 | 2018 | 4 |
| ... | ... | ... | ... | ... |
| 49 | 2015 | 2016 | 2017 | 331 |
| 35 | 2014 | 2020 | 2021 | 332 |
| 65 | 2015 | 2018 | 2021 | 333 |
| 52 | 2015 | 2016 | 2020 | 334 |
| 67 | 2015 | 2019 | 2016 | 335 |
336 rows × 4 columns
# display: soil column names, then per-column missingness
list(soil)
summarize_col_missing(soil)
| Col | N_miss | Pr_Comp | |
|---|---|---|---|
| 0 | Env | 0 | 100.0 |
| 1 | Year | 0 | 100.0 |
| 2 | LabID | 81 | 66.2 |
| 3 | Date Received | 98 | 59.2 |
| 4 | Date Reported | 81 | 66.2 |
| 5 | E Depth | 96 | 60.0 |
| 6 | 1:1 Soil pH | 83 | 65.4 |
| 7 | WDRF Buffer pH | 100 | 58.3 |
| 8 | 1:1 S Salts mmho/cm | 100 | 58.3 |
| 9 | Texture No | 97 | 59.6 |
| 10 | Organic Matter LOI % | 89 | 62.9 |
| 11 | Nitrate-N ppm N | 98 | 59.2 |
| 12 | lbs N/A | 100 | 58.3 |
| 13 | Potassium ppm K | 81 | 66.2 |
| 14 | Sulfate-S ppm S | 100 | 58.3 |
| 15 | Calcium ppm Ca | 96 | 60.0 |
| 16 | Magnesium ppm Mg | 96 | 60.0 |
| 17 | Sodium ppm Na | 98 | 59.2 |
| 18 | CEC/Sum of Cations me/100g | 100 | 58.3 |
| 19 | %H Sat | 100 | 58.3 |
| 20 | %K Sat | 98 | 59.2 |
| 21 | %Ca Sat | 98 | 59.2 |
| 22 | %Mg Sat | 98 | 59.2 |
| 23 | %Na Sat | 98 | 59.2 |
| 24 | Mehlich P-III ppm P | 81 | 66.2 |
| 25 | % Sand | 105 | 56.2 |
| 26 | % Silt | 105 | 56.2 |
| 27 | % Clay | 105 | 56.2 |
| 28 | Texture | 103 | 57.1 |
| 29 | BpH | 229 | 4.6 |
| 30 | Zinc ppm Zn | 237 | 1.2 |
| 31 | Iron ppm Fe | 237 | 1.2 |
| 32 | Manganese ppm Mn | 237 | 1.2 |
| 33 | Copper ppm Cu | 237 | 1.2 |
| 34 | Boron ppm B | 237 | 1.2 |
| 35 | Comments | 214 | 10.8 |
# drop low completion rate entries
temp = summarize_col_missing(df = soil)
# high percent complete columns
high_pr_comp_cols = list(temp.loc[(temp.Pr_Comp) > 50, # this threshold used to be 70, but the additon of envs
                                  # which did not have soil measured deflate the completion rates
                                  'Col'])
# keep only the well-populated columns, minus the bookkeeping ones
soil = soil.loc[:, high_pr_comp_cols].drop(columns = [
    'LabID',         #|- Not interested in these columns
    'Date Received', #|
    'Date Reported'])#|
# write out a log of the enviroments imputed
def log_imputed_envs(
    df = meta,
    df_name = 'meta',
    col = 'Date_Planted'
):
    """Write a CSV listing the distinct Envs where `df[col]` is missing.

    Output path: ./data/Preparation/<df_name>_Envs_imp_<col>.csv
    """
    missing = df[col].isna()
    envs = df.loc[missing, ['Env']].drop_duplicates().reset_index(drop = True)
    envs.to_csv('./data/Preparation/'+df_name+'_Envs_imp_'+col+'.csv', index=False)
# discard columns that have low completion or redundant information
unwanted_meta_cols = [
    'Experiment_Code',
    'Treatment',
    'City',
    'Farm',
    'Field',
    'Trial_ID (Assigned by collaborator for internal reference)',
    'Soil_Taxonomic_ID and horizon description, if known',
    'Weather_Station_Serial_Number (Last four digits, e.g. m2700s#####)',
    'Type_of_planter (fluted cone; belt cone; air planter)',
    'In-season_tillage_method(s)', # 34% Pr_Comp
    'Plot_Area_ha', # 92.1 % complete but not a covariate I want to use
    'System_Determining_Moisture',
    'Cardinal_Heading_Pass_1',
    'Irrigated']
# Only drop the ones actually present in meta.
meta = meta.drop(columns = [c for c in unwanted_meta_cols if c in list(meta)])
summarize_col_missing(meta)
| Col | N_miss | Pr_Comp | |
|---|---|---|---|
| 0 | Env | 0 | 100.0 |
| 1 | Year | 0 | 100.0 |
| 2 | Hybrid | 0 | 100.0 |
| 3 | Date_Planted | 11917 | 91.8 |
| 4 | Date_Harvested | 12680 | 91.3 |
| 5 | Previous_Crop | 17424 | 88.1 |
| 6 | Pre-plant_tillage_method(s) | 27447 | 81.2 |
| 7 | Pounds_Needed_Soil_Moisture | 44351 | 69.6 |
| 8 | Latitude_of_Field | 0 | 100.0 |
| 9 | Longitude_of_Field | 0 | 100.0 |
# Record which Envs lack Previous_Crop before it gets encoded/imputed.
log_imputed_envs(df = meta, df_name = 'meta', col = 'Previous_Crop')
# need data dicts before I can encode
# Map the free-text Previous_Crop entries onto '_'-joined crop tokens.
Previous_Crop = {
    'soybean': 'soy',
    'cotton': 'cotton',
    'wheat': 'wheat',
    'wheat/double crop soybean': 'soy_wheat',
    'corn': 'corn',
    'Lima beans followed by rye cover crop': 'lima_rye',
    'sorghum': 'sorghum',
    'Winter wheat': 'wheat',
    'Fallow most of 2014 winter planted in fall of 2014 then sprayed with Glystar 24 floz/a on 5/3/15 and killed spring of 2015 spray ': 'fallow',
    'peanut': 'peanut',
    'wheat and Double Crop soybean': 'soy_wheat',
    'sugar beet': 'beet',
    'Small Grains and Double Crop soybean': 'soy_rye',
    'soybean/pumpkin': 'soy_pumpkin',
    'wheat/soybean': 'soy_wheat',
    'soybeans with fall cereal rye cover': 'soy_rye'
}
# Pool the distinct crop tokens across all mapped values (each value is a
# '_'-joined list of crops, so splitting every value gives the token set).
{part for value in Previous_Crop.values() for part in value.split('_')}
{'beet',
'corn',
'cotton',
'fallow',
'lima',
'peanut',
'pumpkin',
'rye',
'sorghum',
'soy',
'wheat'}
# One-hot encode Previous_Crop into Cover_* indicator columns, then merge
# those indicators into meta and drop the raw text columns.
temp = pd.DataFrame(list(Previous_Crop.items()),
                    columns = ['Previous_Crop', 'Value'])
for crop in ['beet', 'corn', 'cotton', 'fallow', 'lima',
             'peanut', 'pumpkin', 'rye', 'sorghum', 'soy',
             'wheat']:
    temp['Cover_'+crop] = [1 if re.search(crop, v) else 0 for v in temp['Value']]
meta = meta.merge(temp, how = 'outer').drop(columns = ['Value'])
meta = meta.drop(columns = 'Previous_Crop')
# Record which Envs lack tillage info before it gets encoded/imputed.
log_imputed_envs(df = meta, df_name = 'meta', col = 'Pre-plant_tillage_method(s)')
# Map the free-text tillage descriptions onto '_'-joined method tokens.
# NOTE(review): 'Chisel plow followed by cultivator' appears twice below with
# the same value; in a dict literal the later entry silently wins, which is
# harmless here but worth deduplicating at the source — confirm against the
# raw metadata before removing either line.
Pre_plant_tillage_method = {
    'Conventional': 'Cult',
    'Disc in previous fall': 'Disc',
    'conventional': 'Cult',
    'field cultivator': 'Cult',
    'Fall Chisel': 'Chisel',
    'Fall chisel plow and spring field cultivate': 'Chisel_Cult',
    'chisel': 'Chisel',
    'No-till': 'None',
    'Chisel plow and field cultivator': 'Chisel_Cult',
    'chisel plow in fall; field cultivated in spring': 'Chisel_Cult',
    'In the Spring the land was cut with a disk, then ripped with a chisel plow to a depth of 8-10”. It was then cut again and we applied 300#/acre of 10-0-30-12%S. Next we used a field cultivator with rolling baskets to incorporate the fertilizer. The land was bedded just prior to planting.': 'Chisel_Cult_Disc',
    'no-till': 'None',
    'Field J was fall moldboard plow; Then disked this spring and field cultivated before planting.': 'Chisel_Cult_Disc',
    'The field was minium tilled. The field was disked then cultipacked then Cultimulched then planted': 'Cult_Disc_MinTill',
    'Fall Chisel Plow; Spring Cultivate': 'Chisel_Cult',
    'min-till': 'MinTill',
    'Field cultivator': 'Cult',
    'Field cultivate': 'Cult',
    'No-Till': 'None',
    'fall chisel plow, spring field cultivator': 'Chisel_Cult',
    'disc, conventional, followed by bedding': 'Disc_Cult',
    'No Till': 'None',
    'Chisel plowed 5/4/15 Disc and finishing tool 5/6/15 ': 'Chisel_Disc',
    'Chisel plowed 5/7/15 disc and field finisher 5/23/15': 'Chisel_Disc',
    'Fall Chisel, Spring Turbo-Till': 'Chisel_Cult',
    'Min-Till': 'MinTill',
    'Cultivate, hip and row': 'Cult',
    'Conventional disc tillage': 'Disc_Cult',
    'Field cultivated': 'Cult',
    'Chisel Plow': 'Chisel',
    'Conventional Tillage': 'Cult',
    'Moldboard plowed November\n': 'Chisel',
    'Fall Diskchisel, Spring Culivator': 'Disc_Cult',
    'Cultivator ': 'Cult',
    'Cultivate': 'Cult',
    'Min-Till ': 'MinTill',
    'Field Cultivator': 'Cult',
    'cultivate, hip, row': 'Cult',
    'none': 'None',
    'Chisel': 'Chisel',
    'Fall plow/Spring field cultivator': 'Chisel_Cult',
    'disk and hip': 'Disc',
    'till and hip': 'Chisel',
    '1 pass with soil finisher': 'Cult',
    'disked, chisel plow and field cultivator': 'Chisel_Disc_Cult',
    'harrowed, rototilled': 'Cult',
    'disc': 'Disc',
    'Two passes with disk, one pass with field conditioner, 30Ó beds were made with 8 row ripper-bedder for corn': 'Disc_Cult',
    'Fall Diskchisel / Spring Culivator': 'Chisel_Cult',
    'chisel plow': 'Chisel',
    '2 passes with a field cultivator': 'Cult',
    'disk': 'Disc',
    'Chisel plow,cultivate': 'Chisel_Cult',
    'Fall soil chisel, spring cultivator': 'Chisel_Cult',
    'field cultivate': 'Cult',
    'cultivate, hip and row': 'Cult',
    'disked, ripped, field cultivator': 'Disc_Cult',
    'Ripper Bed, rototill': 'Chisel',
    'standard (disk plow)': 'Disc_Chisel_Cult',
    'spring field cultivator': 'Cult',
    'fall disk chisel, spring cultivator': 'Chisel_Cult',
    'chisel, field cultivate': 'Chisel_Cult',
    'Chisel plow followed by cultivator': 'Chisel_Cult',
    'field cultivate (twice)': 'Cult',
    'Chisel plow': 'Chisel',
    'Chisel plowed on 12/14/17': 'Chisel',
    'Heavy disk, Chisel plow, Field cultivator': 'Chisel_Disc_Cult',
    'Disked, chisil, disk': 'Chisel_Disc_Cult',
    'Ripper Bed, Rototill': 'Cult',
    'disked and field conditioned': 'Disc',
    'field cultivate ': 'Cult',
    'Case IH 335 VT 4" deep ': 'Cult',
    'Chisel - field cultivator': 'Cult',
    'Chisel plow followed by cultivator': 'Chisel_Cult',
    'CaseIH VT 360 vertical tillage tool gone over 2X on 5/22': 'Cult',
    'CaseIH VT 360 vertical tillage tool gone over 2X on 5/23': 'Cult',
    'Fall Diskchisel, Spring Disk': 'Disc',
    'Cultivator': 'Cult',
    'disc harrow followed by chisel plow, field cultivator used to prepare final seed bed': 'Disc_Chisel_Cult',
    'harrow, ripper bed, rototill': 'Cult',
    'soil finisher': 'None',
    'conventional, heavy disc, chisel plow, field cultivator': 'Disc_Chisel_Cult',
    'cultivate 2x': 'Cult',
    'cultivator': 'Cult',
    'Conventional tillage (disc) + ripped and bedded rows': 'Disc',
    'Heavy Disk and then rows placed': 'Disc',
    'Chisel plow, disk, field cultivator': 'Disc_Chisel_Cult',
    'Harrow, ripper bed, rototill': 'Cult',
    'Field cultivator. Tillage with soil finisher': 'Cult',
    'Strip tillage': 'Cult',
    'Fall rip one pass tool, spring field cultivate': 'Cult',
    'conventional - Field Cultivator, disk': 'Cult',
    'None': 'None',
    'CaseIH VT 360 vertical tillage tool gone over 2X on 5/18/2021': 'Cult',
    'CaseIH VT 360 vertical tillage tool gone over 2X on 5/18': 'Cult',
    'Conventional, heavy disc, chisel plow, field cultivator': 'Disc_Chisel_Cult',
    'Disked the whole field and then rows were placed': 'Disc',
    'Disc, Dynadrive': 'Disc',
    'ripper bed, rototill': 'Cult',
    'Disk': 'Disc',
    'Disk, field cultivator, ripper/bedder': 'Disc_Chisel_Cult',
    'Fall disk chisel, Spring cultivator': 'Disc_Chisel_Cult',
    'conventional, heavy disc, chisel plow, and field cultivator': 'Disc_Chisel_Cult',
    'Discing': 'Disc_Chisel_Cult'
}
# Pool the distinct tillage tokens across all mapped values.
{part for value in Pre_plant_tillage_method.values() for part in value.split('_')}
{'Chisel', 'Cult', 'Disc', 'MinTill', 'None'}
# One-hot encode the collapsed tillage values into Pre_* indicator columns.
# ('None' intentionally has no indicator: all zeros means no pre-plant tillage.)
temp = pd.DataFrame(list(Pre_plant_tillage_method.items()),
                    columns = ['Pre-plant_tillage_method(s)', 'Value'])
for method in ['Chisel', 'Cult', 'Disc', 'MinTill']:
    temp['Pre_'+method] = [1 if re.search(method, v) else 0 for v in temp['Value']]
temp
| Pre-plant_tillage_method(s) | Value | Pre_Chisel | Pre_Cult | Pre_Disc | Pre_MinTill | |
|---|---|---|---|---|---|---|
| 0 | Conventional | Cult | 0 | 1 | 0 | 0 |
| 1 | Disc in previous fall | Disc | 0 | 0 | 1 | 0 |
| 2 | conventional | Cult | 0 | 1 | 0 | 0 |
| 3 | field cultivator | Cult | 0 | 1 | 0 | 0 |
| 4 | Fall Chisel | Chisel | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| 99 | Disk | Disc | 0 | 0 | 1 | 0 |
| 100 | Disk, field cultivator, ripper/bedder | Disc_Chisel_Cult | 1 | 1 | 1 | 0 |
| 101 | Fall disk chisel, Spring cultivator | Disc_Chisel_Cult | 1 | 1 | 1 | 0 |
| 102 | conventional, heavy disc, chisel plow, and fie... | Disc_Chisel_Cult | 1 | 1 | 1 | 0 |
| 103 | Discing | Disc_Chisel_Cult | 1 | 1 | 1 | 0 |
104 rows × 6 columns
# Merge the tillage one-hots into meta; the raw text columns are now redundant.
meta = meta.merge(temp, how = 'outer').drop(columns = ['Pre-plant_tillage_method(s)', 'Value'])
# Log the Envs whose soil-moisture sample weight will be imputed.
log_imputed_envs(df = meta, df_name = 'meta', col = 'Pounds_Needed_Soil_Moisture')
# Collapse the free-text weights into numeric strings; '-9999' flags "Unknown".
meta = sanitize_col(df = meta, col = 'Pounds_Needed_Soil_Moisture',
    simple_renames= {
        'Unknown': '-9999',
        'Unknown, currently getting in contact with manufacturer, technician estimated about 5 lbs of grain to get moisture reading':'5',
        '4 or 5':'4.5',
        '~2.5':'2.5',
        "Based on the technitian's experience a minimum of 4 lbs is required. However the user manual says the minimum volume for accurate determination is 2 liters":'4',
        'Depend on moisture content of grain 15.5% moisture 5.84 lbs 30% moisture 7.05 lbs':'6.445',
        '2.5 lbs.':'2.5',
        '5-6.5':'5.75',
        '3 to 4':'3.5',
        '~5 lbs':'5',
        '~10 lbs':'10',
        '7 to 9':'8',
        '<1': '0.5'
    },
    split_renames= {})
meta['Pounds_Needed_Soil_Moisture'] = meta['Pounds_Needed_Soil_Moisture'].astype(float)
# Convert the -9999 sentinel to NaN so downstream imputation will catch it.
unknown_mask = meta['Pounds_Needed_Soil_Moisture'] == -9999
meta.loc[unknown_mask, 'Pounds_Needed_Soil_Moisture'] = np.nan
# Names of the indicator columns created above, for use during imputation.
pre_plant_cols = ['Pre_' + m for m in ('Chisel', 'Cult', 'Disc', 'MinTill')]
cover_crop_cols = ['Cover_' + c for c in (
    'beet', 'corn', 'cotton', 'fallow', 'lima', 'peanut',
    'pumpkin', 'rye', 'sorghum', 'soy', 'wheat')]
# mode impute the one hot encoded variables
from scipy import stats # stats.mode for imputation
for col in pre_plant_cols + cover_crop_cols:
    # Compute the mode over one value per Env so large Envs don't dominate.
    env_values = meta.loc[:, ['Env', col]].drop_duplicates()
    fill_value = stats.mode(env_values[col], keepdims = False).mode
    meta.loc[meta[col].isna(), col] = fill_value
# median impute soil moisture
na_rows = meta['Pounds_Needed_Soil_Moisture'].isna()
meta.loc[na_rows, 'Pounds_Needed_Soil_Moisture'] = np.nanmedian((meta['Pounds_Needed_Soil_Moisture']))
# Impute soil based on GPS coordinates
# Use distance between lon/lat to fill in missing values with the nearest.
# I'm using euclidean distance of lon/lat so this is not perfect.
# For each numeric soil column with missing entries, fill each missing Env's
# value with the median of that column at the geographically closest Env.
temp = meta.loc[:, ['Env', 'Latitude_of_Field', 'Longitude_of_Field']].drop_duplicates()
soil = soil.merge(temp).drop(columns = 'Texture') # with sand silt clay percents this is redundant info
check_cols = [e for e in list(soil) if e not in ['Env', 'Year', 'Latitude_of_Field', 'Longitude_of_Field']]
for check_col in check_cols:
    # check_col = check_cols[0]
    mask = soil[check_col].isna()
    fill_envs = soil.loc[mask, 'Env']
    for fill_env in fill_envs:
        # fill_env = fill_envs[0]
        # Only numeric columns can take a median; object columns are skipped.
        if np.dtype(soil[check_col]) != 'O':
            # NOTE(review): float(Series) assumes a single row per Env here —
            # multiple rows would raise; confirm Env/coord pairs are unique.
            match_lat = soil.loc[(soil.Env == fill_env), 'Latitude_of_Field']
            match_lon = soil.loc[(soil.Env == fill_env), 'Longitude_of_Field']
            # Distance is written as a temporary column on the whole frame and
            # recomputed for every fill_env; it is dropped at the end.
            soil['Distance'] = np.sqrt( ((soil['Latitude_of_Field'] - float(match_lat))**2
                                ) + ((soil['Longitude_of_Field'] - float(match_lon))**2))
            # Closest *other* Env (the Env being filled is excluded from the min).
            dist_min = np.nanmin(soil.loc[(soil.Env != fill_env), 'Distance'])
            # print(dist_min)
            dist_mask = soil.Distance == dist_min
            soil.loc[(soil.Env == fill_env), check_col] = np.nanmedian(soil.loc[dist_mask, check_col])
# Remove the helper columns; soil returns to Env/Year + measurements only.
soil = soil.drop(columns = ['Latitude_of_Field', 'Longitude_of_Field', 'Distance'])
/home/labmember/miniconda3/envs/g2f_comp/lib/python3.10/site-packages/numpy/lib/nanfunctions.py:1217: RuntimeWarning: All-NaN slice encountered r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
# Impute soil without GPS coordinates: KNN over the numeric measurements.
imputer = KNNImputer(n_neighbors=3)
feature_cols = [e for e in list(soil) if e not in ['Env', 'Year']]
knn_imputed = pd.DataFrame(
    imputer.fit_transform(soil.drop(columns = ['Env', 'Year'])),
    columns = feature_cols)
soil = pd.concat([soil.loc[:, ['Env', 'Year']], knn_imputed], axis = 1)
# On WSL `assert False not in (summarize_col_missing(soil).loc[:, 'Pr_Comp'] == 100)`
# was fine, but on linux it asserted an error despite there being none below
# 100.0. The explicit list form below works as expected.
assert False not in [True if e == 100 else False for e in list(summarize_col_missing(soil).loc[:, 'Pr_Comp'])]
print("No missing values in `soil`")
No missing values in `soil`
def dl_power_data(
    latitude = 32.929,
    longitude = -95.770,
    start_YYYYMMDD = 20150101,
    end_YYYYMMDD = 20150305
):
    # Modified by
    # https://power.larc.nasa.gov/docs/tutorials/service-data-request/api/
    '''
    Download daily point weather from the NASA POWER API.

    *Version: 2.0 Published: 2021/03/09* Source: [NASA POWER](https://power.larc.nasa.gov/)
    POWER API Multi-Point Download
    This is an overview of the process to request data from multiple data points from the POWER API.

    Parameters
    ----------
    latitude, longitude : float
        Point to query.
    start_YYYYMMDD, end_YYYYMMDD : int
        Inclusive date range formatted as YYYYMMDD.

    Returns
    -------
    pd.DataFrame
        One row per day with columns Latitude, Longitude, Date, then one
        column per POWER parameter (QV2M, T2MDEW, PS, ...).

    Raises
    ------
    requests.HTTPError
        If the API responds with a 4xx/5xx status (previously this fell
        through to a confusing JSON decode error).
    '''
    base_url = r"https://power.larc.nasa.gov/api/temporal/daily/point?parameters=QV2M,T2MDEW,PS,RH2M,WS2M,GWETTOP,ALLSKY_SFC_SW_DWN,ALLSKY_SFC_PAR_TOT,T2M_MAX,T2M_MIN,T2MWET,GWETROOT,T2M,GWETPROF,ALLSKY_SFC_SW_DNI,PRECTOTCORR&community=RE&longitude={longitude}&latitude={latitude}&start={start_YYYYMMDD}&end={end_YYYYMMDD}&format=JSON"
    api_request_url = base_url.format(
        longitude=longitude,
        latitude=latitude,
        start_YYYYMMDD=start_YYYYMMDD,
        end_YYYYMMDD=end_YYYYMMDD)
    response = requests.get(url=api_request_url, verify=True, timeout=30.00)
    # Fail loudly on HTTP errors instead of crashing later while parsing.
    response.raise_for_status()
    content = response.json()
    # Repackage content as data frame: one long (Date, value) frame per
    # parameter, then merge them all on Date.
    df_list = [
        pd.DataFrame(content['properties']['parameter'][e], index = [0]).melt(
        ).rename(columns = {'variable':'Date', 'value':e})
        for e in list(content['properties']['parameter'].keys())
    ]
    out = df_list[0]
    for df_i in df_list[1:]:
        out = out.merge(df_i)
    out['Latitude'] = latitude
    out['Longitude'] = longitude
    # Put the identifying columns first.
    first_cols = ['Latitude', 'Longitude', 'Date']
    out = out.loc[:, first_cols+[e for e in list(out) if e not in first_cols]]
    return(out)
# dl_power_data(
# latitude = 32.929,
# longitude = -95.770,
# start_YYYYMMDD = 20150101,
# end_YYYYMMDD = 20150305
# )
# for reference, here's some info on the structure of contents
# dict_keys(['type', 'geometry', 'properties', 'header', 'messages', 'parameters', 'times'])
# content['header']
# {'title': 'NASA/POWER CERES/MERRA2 Native Resolution Daily Data',
# 'api': {'version': 'v2.3.5', 'name': 'POWER Daily API'},
# 'sources': ['power', 'ceres', 'merra2'],
# 'fill_value': -999.0,
# 'start': '20150101',
# 'end': '20150305'}
# content['parameters']
# {'QV2M': {'units': 'g/kg', 'longname': 'Specific Humidity at 2 Meters'},
# 'T2MDEW': {'units': 'C', 'longname': 'Dew/Frost Point at 2 Meters'},
# 'PS': {'units': 'kPa', 'longname': 'Surface Pressure'},
# 'RH2M': {'units': '%', 'longname': 'Relative Humidity at 2 Meters'},
# 'WS2M': {'units': 'm/s', 'longname': 'Wind Speed at 2 Meters'},
# 'GWETTOP': {'units': '1', 'longname': 'Surface Soil Wetness'},
# 'ALLSKY_SFC_SW_DWN': {'units': 'kW-hr/m^2/day',
# 'longname': 'All Sky Surface Shortwave Downward Irradiance'},
# 'ALLSKY_SFC_PAR_TOT': {'units': 'W/m^2',
# 'longname': 'All Sky Surface PAR Total'},
# 'T2M_MAX': {'units': 'C', 'longname': 'Temperature at 2 Meters Maximum'},
# 'T2M_MIN': {'units': 'C', 'longname': 'Temperature at 2 Meters Minimum'},
# 'T2MWET': {'units': 'C', 'longname': 'Wet Bulb Temperature at 2 Meters'},
# 'GWETROOT': {'units': '1', 'longname': 'Root Zone Soil Wetness'},
# 'T2M': {'units': 'C', 'longname': 'Temperature at 2 Meters'},
# 'GWETPROF': {'units': '1', 'longname': 'Profile Soil Moisture'},
# 'ALLSKY_SFC_SW_DNI': {'units': 'kW-hr/m^2/day',
# 'longname': 'All Sky Surface Shortwave Downward Direct Normal Irradiance'},
# 'PRECTOTCORR': {'units': 'mm/day', 'longname': 'Precipitation Corrected'}}
# this shows that the downloaded values and given values are almost in perfect
# agreement (ALLSKY_SFC_SW_DWN, ALLSKY_SFC_SW_DNI) are the only discrepancies
# for the test case. I'll list out those sites with errors and selectively
# download them and fill in missing values. To be polite, check if the weather
# data already exists before drawing from POWER's API.
# NOTE. Most, but not all of the missing values are due to the release date.
# Disabled spot-check: compare the provided weather for DEH1_2014 against a
# fresh POWER download. Flip the guard to True to rerun the comparison.
if False:  # was `if True == False:` — constant comparison, same effect
    wthr_DEH1_2014 = wthr.loc[wthr.Env == 'DEH1_2014', :]
    # Use positional indexing for the first row; `.loc[0, :]` would KeyError
    # unless the filtered frame happened to retain index label 0.
    lat, lon = list(meta.loc[meta.Env == 'DEH1_2014', ['Latitude_of_Field', 'Longitude_of_Field']].iloc[0, :])
    powr_DEH1_2014 = dl_power_data(
        latitude = lat,
        longitude = lon,
        start_YYYYMMDD = np.min(wthr_DEH1_2014.Date),
        end_YYYYMMDD = np.max(wthr_DEH1_2014.Date)
    )
    # Mean per-measure difference between the provided and downloaded values.
    pd.DataFrame(
        [(e, (np.mean(wthr_DEH1_2014[e] - powr_DEH1_2014[e]))
        ) for e in list(wthr_DEH1_2014) if e not in ['Env', 'Year', 'Date']
        ], columns = ['Measure', 'Total_Difference']
    )
polite_request_interval = 10 # time in seconds between NASA POWER API requests
# check if clean weather exists, load if it does, download and fill in if not
if 'wthr_powr_imp.csv' in os.listdir('./data/Preparation/'):
    print('Reading Weather from Preparation/')
    wthr = pd.read_csv('./data/Preparation/wthr_powr_imp.csv')
else:
    print('Filling in Weather from NASA Power')
    # Figure out what Envs need to be imputed: any Env with at least one
    # missing value in any weather column.
    temp = wthr
    temp = temp.drop(columns = ['Year', 'Date'])
    for col in [e for e in list(temp) if e != 'Env']:
        temp[col] = temp[col].isna()
    temp = temp.drop_duplicates()
    # Num_Rep counts how many columns have at least one NA for the Env.
    temp['Num_Rep'] = temp.drop(columns = ['Env']).sum(axis = 1)
    impute_envs = list(temp.loc[(temp.Num_Rep > 0), 'Env'].drop_duplicates())
    # impute_envs
    for impute_env in impute_envs:
        # impute_env = 'WIH3_2022'#impute_envs[0]
        meta_mask = (meta.Env == impute_env)
        # NOTE(review): float(Series) assumes exactly one distinct coordinate
        # per Env and is deprecated in newer pandas — confirm uniqueness.
        lat = float(meta.loc[meta_mask, 'Latitude_of_Field'].drop_duplicates())
        lon = float(meta.loc[meta_mask, 'Longitude_of_Field'].drop_duplicates())
        wthr_mask = (wthr.Env == impute_env)
        # Download only the date span already present for this Env.
        date_min = np.min(wthr.loc[wthr_mask, 'Date'])
        date_max = np.max(wthr.loc[wthr_mask, 'Date'])
        # Delay between requests (be polite to the POWER API)
        if impute_env != impute_envs[0]:
            time.sleep(polite_request_interval)
        powr_dl = dl_power_data(
            latitude = lat,
            longitude = lon,
            start_YYYYMMDD = date_min,
            end_YYYYMMDD = date_max
        )
        # cut out the previous download and add in the next
        # change type to allow for merging
        powr_dl['Date'] = powr_dl['Date'].astype(int)
        # Replace this Env's rows entirely with the downloaded values.
        wthr_slice = wthr.loc[wthr_mask, ['Env', 'Year', 'Date']].merge(powr_dl)
        wthr = wthr.loc[~wthr_mask, :].merge(wthr_slice, how = 'outer')
    # Latitude/Longitude came along from dl_power_data; drop and cache.
    wthr = wthr.drop(columns = ['Latitude', 'Longitude'])
    wthr.to_csv('./data/Preparation/wthr_powr_imp.csv', index=False)
Reading Weather from Preparation/
# check if clean weather exists, load if it does, download and fill in if not
if 'wthr_powr_imp_knn.csv' in os.listdir('./data/Preparation/'):
    wthr_knn = pd.read_csv('./data/Preparation/wthr_powr_imp_knn.csv')
else:
    # For values that are missing in POWER (-999), remove and then knn impute them
    for col in [e for e in list(wthr) if sum((wthr[e] == -999)) > 0 ]:
        mask = (wthr[col] == -999)
        wthr.loc[mask, col] = np.nan
    imputer = KNNImputer(n_neighbors=20)
    knn_imputed = pd.DataFrame(
        imputer.fit_transform(
            wthr.drop(columns = ['Env', 'Year', 'Date'])))
    knn_imputed.columns = [e for e in list(wthr) if e not in ['Env', 'Year', 'Date']]
    wthr_knn = pd.concat([wthr.loc[:, ['Env', 'Year', 'Date']], knn_imputed], axis = 1)
    wthr_knn.to_csv('./data/Preparation/wthr_powr_imp_knn.csv', index=False)
# clip to last doy: keep only (Year, Date) pairs at or before the latest day
# of year available in 2022 so all years cover the same span.
wthr = wthr_knn
temp = wthr.loc[:, ['Year', 'Date']].drop_duplicates()
# (A dead `Date_Str` helper column was previously built and dropped here
# without ever being used; it has been removed.)
temp['DOY'] = [pd.Period(e, freq='D').day_of_year for e in list(temp['Date'])]
# 2022 is the constraint. It only goes up to day 314.
clip_doy = np.max(temp.loc[(temp.Year == 2022), 'DOY'])
temp = temp.loc[(temp.DOY <= clip_doy), ]
wthr = wthr.merge(temp)
# Same WSL/linux assertion quirk as with `soil`; the list form works reliably.
# assert False not in (summarize_col_missing(wthr).loc[:, 'Pr_Comp'] == 100)
assert False not in [True if e == 100 else False for e in list(summarize_col_missing(wthr).loc[:, 'Pr_Comp'])]
print("No missing values in `wthr`")
No missing values in `wthr`
# Impute cgmv (crop growth model variables) for Envs missing them by copying
# the column means from the geographically closest Env that has data.
log_imputed_envs(
    df = cgmv,
    df_name = 'cgmv',
    col = 'SDR_pGerEme_1'
)
# Envs missing the first cgmv column are treated as missing cgmv entirely.
mask = cgmv.SDR_pGerEme_1.isna()
imp_Envs = cgmv.loc[mask, ['Env', ]].drop_duplicates()
imp_Envs = imp_Envs.reset_index().drop(columns = 'index')
gps_lookup = meta.loc[:, ['Env', 'Latitude_of_Field', 'Longitude_of_Field']].drop_duplicates()
# get rid of sites that need to be imputed in cgmv so the closest value is good to go.
antimask = gps_lookup.Env.isin(imp_Envs['Env'])
gps_search = gps_lookup.loc[antimask, ]  # Envs to be filled
gps_lookup = gps_lookup.loc[~antimask, ] # candidate donor Envs
gps_lookup['Distance'] = np.nan
def temp_find_closes(Env):
    # Return the donor Env(s) closest (euclidean lat/lon) to `Env`.
    # NOTE(review): mutates the outer-scope gps_lookup['Distance'] column on
    # every call — fine for this one-shot use, but not reusable.
    mask = (gps_search.Env == Env)
    lat = list(gps_search.loc[mask, ]['Latitude_of_Field'])[0]
    lon = list(gps_search.loc[mask, ]['Longitude_of_Field'])[0]
    gps_lookup['Distance'] = np.sqrt( ((gps_lookup['Latitude_of_Field'] - float(lat))**2
                             ) + ((gps_lookup['Longitude_of_Field'] - float(lon))**2))
    out = gps_lookup.loc[(gps_lookup.Distance == min(gps_lookup.Distance)), 'Env']
    return(list(out))
imp_with_Envs = [temp_find_closes(e) for e in list(imp_Envs.Env)]
# this is very sloppy but it's effective and fast to write:
# for all the envs that need to be matched, get the closest Env(s) then loop
# over the cols, filling each with the donor Env's column mean.
for i in tqdm.tqdm(range(len(list(imp_Envs.Env)))):
    fillin = list(imp_Envs.Env)[i]
    fillinwith = imp_with_Envs[i]
    mask_fillin = cgmv.Env == fillin
    mask_fillinwith = cgmv.Env.isin(fillinwith)
    for col in [e for e in list(cgmv) if e not in ['Env', 'Year']]:
        fillin_value = np.nanmean(cgmv.loc[mask_fillinwith, col])
        cgmv.loc[mask_fillin, col] = fillin_value
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [00:08<00:00, 6.14it/s]
summarize_col_missing(cgmv)
| Col | N_miss | Pr_Comp | |
|---|---|---|---|
| 0 | Env | 0 | 100.0 |
| 1 | Year | 0 | 100.0 |
| 2 | SDR_pGerEme_1 | 0 | 100.0 |
| 3 | SDR_pEmeEnJ_1 | 0 | 100.0 |
| 4 | SDR_pEnJFlo_1 | 0 | 100.0 |
| ... | ... | ... | ... |
| 762 | AccumulatedTT_pFlaFlw | 0 | 100.0 |
| 763 | AccumulatedTT_pFlwStG | 0 | 100.0 |
| 764 | AccumulatedTT_pStGEnG | 0 | 100.0 |
| 765 | AccumulatedTT_pEnGMat | 0 | 100.0 |
| 766 | AccumulatedTT_pMatHar | 0 | 100.0 |
767 rows × 3 columns
# Test out the workflow in miniature by imputing Planting date
mask = (testing_years.Random_Order == 66)
# Columns are Test_HPS, Test_Model, Test_Ensemble; 2022 is the submission year.
holdout_years = list(
    testing_years.loc[mask, [
        'Test_HPS',
        'Test_Model',
        'Test_Ensemble']].reset_index(drop = True).loc[0, :])
holdout_years = [str(e) for e in holdout_years + [2022]]
# Four nested stages — 1. Hyperparameters, 2. Model, 3. Ensemble,
# 4. Submission — each holds out one fewer trailing year and tests the next
# single year in sequence.
hps_hold, mod_hold, ens_hold, sub_hold = (holdout_years[-n:] for n in (4, 3, 2, 1))
hps_test, mod_test, ens_test, sub_test = (holdout_years[i:i + 1] for i in range(4))
# Making the input data
class df_prep():
    """Assemble aligned train/test x/y data for one imputation task.

    Intended workflow (see the demos below):
    get_train_test_Envs -> add_ys -> add_xs -> isolate_missing_y ->
    prep_idx_y -> mk_scale_dict/update_cs_dict -> apply_scaling -> mk_arrays.

    `self.train` and `self.test` each hold a dict with keys:
      'set' : Env/Year pairs in the split
      'x'   : covariates joined onto those Envs
      'y'   : response rows with no missing values
      'yna' : response rows with at least one missing value
    """
    def __init__(self):
        self.train = {
            "set":None,
            "x":None,
            "y":None,
            "yna":None
        }
        self.test = {
            "set":None,
            "x":None,
            "y":None,
            "yna":None
        }
        self.cs_dict = None # center/scale dict: {col: {'mean': m, 'std': s}}
        self.isolate_missing_y_run = False # This is just for a guard rail in mk_scale_dict
    # set up the dfs for the y var
    def get_train_test_Envs(
        self,
        # NOTE(review): these defaults bind the notebook globals at class
        # definition time; pass `df` explicitly for anything non-interactive.
        df = meta,
        holdout_years = ['2020', '2014', '2016', '2022'],
        test_year = ['2020']
    ):
        """Split Env/Year pairs: train is everything outside holdout+test years."""
        mask = df.Year.isin(holdout_years+test_year)
        train_set = df.loc[~mask, ['Env', 'Year']].drop_duplicates()
        mask = df.Year.isin(test_year)
        test_set = df.loc[mask, ['Env', 'Year']].drop_duplicates()
        self.train['set'] = train_set
        self.test['set'] = test_set
    ## Retrieve data based on Envs in test/train sets ==========================
    # add in y variable
    def _mk_ys_df(
        self,
        df_envs, #= train_HPS, # self. ['set']
        df_data = meta,
        add_cols = ['Date_Planted']
    ):
        """Left-join `add_cols` from `df_data` onto the Envs in `df_envs`."""
        df_data = df_data.loc[:, ['Env']+add_cols]
        df_out = df_envs.merge(df_data, 'left').drop_duplicates()
        return(df_out)
    def add_ys(
        self,
        #df_envs = train_HPS,
        df_data = meta,
        add_cols = ['Date_Planted']
    ):
        """Populate train/test 'y' frames from `df_data[add_cols]`."""
        self.train['y'] = self._mk_ys_df(
            df_envs = self.train['set'],
            df_data = df_data,
            add_cols = add_cols)
        self.test['y'] = self._mk_ys_df(
            df_envs = self.test['set'],
            df_data = df_data,
            add_cols = add_cols)
    # add in x variables
    def _mk_xs_df(
        self,
        df_envs, #= train_HPS,
        df_data = wthr,
        drop_cols = ['Year', 'Date', 'DOY']
    ):
        """Left-join covariates from `df_data` onto the Envs, dropping `drop_cols`."""
        df_data = df_data.drop(columns = [e for e in list(df_data) if e in drop_cols])
        df_out = df_envs.merge(df_data, 'left').drop_duplicates()
        df_out = df_out.drop(columns = [e for e in list(df_out) if e in drop_cols])
        return(df_out)
    def add_xs(
        self,
        #df_envs = train_HPS,
        df_data = wthr,
        drop_cols = ['Year', 'Date', 'DOY']
    ):
        """Populate train/test 'x' frames from `df_data`."""
        self.train['x'] = self._mk_xs_df(
            df_envs = self.train['set'],
            df_data = df_data,
            drop_cols = drop_cols)
        self.test['x'] = self._mk_xs_df(
            df_envs = self.test['set'],
            df_data = df_data,
            drop_cols = drop_cols)
    ## Separate out missing responses ==========================================
    def isolate_missing_y(self):
        """Move rows with any missing value from 'y' into 'yna' for both splits."""
        if self.train['y'] is not None:
            mask = (self.train['y'].isnull().any(axis = 1))
            self.train['yna'] = self.train['y'].loc[mask, :].copy()
            self.train['y'] = self.train['y'].loc[~mask, :].copy()
        if self.test['y'] is not None:
            mask = (self.test['y'].isnull().any(axis = 1))
            self.test['yna'] = self.test['y'].loc[mask, :].copy()
            self.test['y'] = self.test['y'].loc[~mask, :].copy()
        self.isolate_missing_y_run = True # This is just for a guard rail in mk_scale_dict
    def prep_idx_y(self):
        """Reset the index on all y/yna frames (mk_arrays indexes positionally)."""
        if self.train['y'] is not None:
            self.train['y'] = self.train['y'].reset_index().drop(columns = 'index')
        if self.train['yna'] is not None:
            self.train['yna'] = self.train['yna'].reset_index().drop(columns = 'index')
        if self.test['y'] is not None:
            self.test['y'] = self.test['y'].reset_index().drop(columns = 'index')
        if self.test['yna'] is not None:
            self.test['yna'] = self.test['yna'].reset_index().drop(columns = 'index')
    ## Center & Scaling dict ==================================================
    # This looks odd (why not only have one method?) but is intentional. The idea here is that one
    # might want to provide a saved center and scaling dictionary or include custom scaling for some
    # columns. By including an update method making the default to return the dictionary instead of
    # updating it silently it's easier to access and makes the step more visible.
    def update_cs_dict(self, cs_dict):
        """Install a center/scale dict ({col: {'mean': m, 'std': s}})."""
        self.cs_dict = cs_dict
    def mk_scale_dict(self,
                      scale_cols = ['Date_Planted'],
                      return_cs_dict = True,
                      store_cs_dict = False):
        """Compute a mean/std dict over the training data for `scale_cols`.

        Returns the dict when `return_cs_dict`; installs it when `store_cs_dict`.
        """
        # scale df
        if not self.isolate_missing_y_run:
            print("Warning: if run before isolate_missing_y all observations will be used.")
        temp = self.train['y'].merge(self.train['x'], how = 'left')
        cs_dict = {}
        for e in scale_cols:
            cs_dict.update({e : {'mean': np.mean(temp[e]),
                                 'std' : np.std(temp[e])}})
        if store_cs_dict:
            if self.cs_dict is not None:
                print('Overwriting Center and Scaling Dict.')
            self.update_cs_dict(cs_dict)
        if return_cs_dict:
            return(cs_dict)
    ## Scaling / reverse scaling by dict =======================================
    def _scale_by_dict(self, df):
        """Center/scale (in place) every df column present in cs_dict."""
        scale_cols = [e for e in list(self.cs_dict.keys()) if e in list(df)]
        for e in scale_cols:
            df[e] = (df[e] - self.cs_dict[e]['mean'])/self.cs_dict[e]['std']
        return(df)
    def _unscale_by_dict(self, df):
        """Invert _scale_by_dict for every df column present in cs_dict."""
        scale_cols = [e for e in list(self.cs_dict.keys()) if e in list(df)]
        for e in scale_cols:
            df[e] = (df[e]*self.cs_dict[e]['std'])+self.cs_dict[e]['mean']
        return(df)
    def apply_scaling(self):
        """Apply cs_dict scaling to every non-None y/yna/x frame."""
        if self.train[ 'y'] is not None:
            self.train['y'] = self._scale_by_dict(self.train['y'])
        if self.train[ 'yna'] is not None:
            self.train['yna'] = self._scale_by_dict(self.train['yna'])
        if self.train[ 'x'] is not None:
            self.train['x'] = self._scale_by_dict(self.train['x'])
        if self.test[ 'y'] is not None:
            self.test['y'] = self._scale_by_dict(self.test['y'])
        if self.test[ 'yna'] is not None:
            self.test['yna'] = self._scale_by_dict(self.test['yna'])
        if self.test[ 'x'] is not None:
            self.test['x'] = self._scale_by_dict(self.test['x'])
    def reverse_scaling(self):
        """Undo apply_scaling on every non-None y/yna/x frame."""
        if self.train[ 'y'] is not None:
            self.train['y'] = self._unscale_by_dict(self.train['y'])
        if self.train[ 'yna'] is not None:
            self.train['yna'] = self._unscale_by_dict(self.train['yna'])
        if self.train[ 'x'] is not None:
            self.train['x'] = self._unscale_by_dict(self.train['x'])
        if self.test[ 'y'] is not None:
            self.test['y'] = self._unscale_by_dict(self.test['y'])
        if self.test[ 'yna'] is not None:
            self.test['yna'] = self._unscale_by_dict(self.test['yna'])
        if self.test[ 'x'] is not None:
            self.test['x'] = self._unscale_by_dict(self.test['x'])
    ## numpy arrays to easily be converted to tensors ==========================
    def mk_arrays(
        self,
        split = 'train',
        obs_per_Env = 1,
        return_2d = True,
        missing_ys = False):
        """Return [ys, xs] numpy arrays aligned on the same index.

        split       : 'train' or 'test' (anything else raises ValueError;
                      previously this printed a malformed message and then
                      crashed with an UnboundLocalError on `out`).
        obs_per_Env : x rows per Env (1 for flat data, e.g. 314 for weather).
        return_2d   : collapse xs to (N, C); otherwise xs is (N, Channels, Length).
        missing_ys  : use the 'yna' (missing-response) rows instead of 'y'.
        """
        # using the envs specified in the y data frame and the covariates in the x data frame, return numpy arrays
        # with the xs and ys accessible by the same idx
        def _reformat_xy(y_df,
                         x_df,
                         obs_per_Env,
                         return_2d = False):
            ys_tensor = np.array(y_df.loc[:, [e for e in y_df if e not in ['Env', 'Year']]])
            # Don't seem to need to manually set second dim (to 1 from none) do this when data is drawn from df asabove
            # ys_tensor = ys_tensor.reshape((ys_df.shape[0], 1))
            col_names = [e for e in list(x_df) if e not in ['Env']]
            num_obs = ys_tensor.shape[0] # observations
            # obs_per_Env # user input, 1 in most cases, 314 for weather
            col_per_obs = len(col_names)
            xs_tensor = np.zeros(shape = (num_obs,
                                          obs_per_Env,
                                          col_per_obs))
            for i in y_df.index:
                mask = (x_df.Env == y_df.loc[i, 'Env'])
                xs_tensor[i, :, :] = np.array(x_df.loc[mask, col_names
                                             ].drop_duplicates() )
            # Note! this will result in weather data being in order of
            # N, Length (days), Channels. To match pytorch conventions
            # I swap these below
            # for non weather data
            if return_2d:
                new_dim_0 = xs_tensor.shape[0]
                new_dim_1 = xs_tensor.shape[2]
                xs_tensor = xs_tensor.reshape(new_dim_0, new_dim_1)
            else:
                # swap axes so that weather is in order of
                # N, Channels, Length
                xs_tensor = xs_tensor.swapaxes(1,2)
            return([ys_tensor, xs_tensor])
        if split not in ['train', 'test']:
            raise ValueError('`split` must be "train" or "test"')
        # Both splits run through identical logic; pick the frames once.
        which_y = 'yna' if missing_ys else 'y'
        split_data = self.train if split == 'train' else self.test
        out = _reformat_xy(
            y_df = split_data[which_y],
            x_df = split_data['x'],
            obs_per_Env = obs_per_Env,
            return_2d = return_2d)
        return(out)
# Convert dates to doy so they're easier to work with
def df_date_to_datetime(df, cols):
    """Return a copy of *df* where each column named in *cols* has been
    replaced by its day-of-year integer (via pandas daily Periods)."""
    out = df.copy()
    for name in cols:
        as_doy = []
        for value in out[name]:
            as_doy.append(pd.Period(value, freq='D').day_of_year)
        out[name] = as_doy
    return out
# Demo 3d data (wthr)
# Disabled demo of the weather (3d) pipeline; flip the guard to True to run.
# FIX: 'if True == False:' replaced with the idiomatic 'if False:' (comparing
# to boolean literals is a lint error, PEP 8 / flake8 E712).
if False:
    demo = df_prep()
    demo.get_train_test_Envs(
        df = meta,
        holdout_years = ['2020', '2014', '2016', '2022'],
        test_year = ['2020'] )
    demo.add_ys(
        df_data = df_date_to_datetime(df = meta,
                                      cols = ['Date_Planted']),
        add_cols = ['Date_Planted'])
    demo.add_xs(
        df_data = wthr,
        drop_cols = ['Year', 'Date', 'DOY'])
    demo.isolate_missing_y()
    demo.prep_idx_y()
    demo.mk_scale_dict(
        scale_cols = ['Date_Planted']+[e for e in list(wthr) if e not in ['Env', 'Year', 'Date', 'DOY']],
        return_cs_dict = False,
        store_cs_dict = True)
    demo.apply_scaling()
    # demo.reverse_scaling()
    train_y, train_x = demo.mk_arrays(
        split = 'train',
        obs_per_Env = 314,  # daily weather rows per environment
        return_2d = False,
        missing_ys = False)
    test_y, test_x = demo.mk_arrays(
        split = 'test',
        obs_per_Env = 314,
        return_2d = False,
        missing_ys = False)
    print([e.shape for e in [train_y, train_x, test_y, test_x]])
# Demo 2d data (soil)
# Disabled demo of the soil (2d) pipeline; flip the guard to True to run.
# FIX: 'if True == False:' replaced with the idiomatic 'if False:' (comparing
# to boolean literals is a lint error, PEP 8 / flake8 E712).
if False:
    demo = df_prep()
    demo.get_train_test_Envs(
        df = meta,
        holdout_years = ['2020', '2014', '2016', '2022'],
        test_year = ['2020'] )
    demo.add_ys(
        df_data = df_date_to_datetime(df = meta,
                                      cols = ['Date_Planted']),
        add_cols = ['Date_Planted'])
    demo.add_xs(
        df_data = soil,
        drop_cols = ['Year', 'Date', 'DOY'])
    demo.isolate_missing_y()
    demo.prep_idx_y()
    demo.mk_scale_dict(
        scale_cols = ['Date_Planted']+[e for e in list(soil) if e not in ['Env', 'Year', 'Date', 'DOY']],
        return_cs_dict = False,
        store_cs_dict = True)
    demo.apply_scaling()
    # demo.reverse_scaling()
    train_y, train_x = demo.mk_arrays(
        split = 'train',
        obs_per_Env = 1,  # soil has a single row per environment
        return_2d = True,
        missing_ys = False)
    test_y, test_x = demo.mk_arrays(
        split = 'test',
        obs_per_Env = 1,
        return_2d = True,
        missing_ys = False)
    print([e.shape for e in [train_y, train_x, test_y, test_x]])
# Making the input data
class CustomDataset_wthr(Dataset):
    """Minimal torch Dataset pairing covariates `x` with targets `y` by index."""

    def __init__(self, y, x,
                 transform=None,           # can pass in ToTensor()
                 target_transform=None):
        self.y = y
        self.x = x
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        # one sample per target entry
        return len(self.y)

    def __getitem__(self, idx):
        sample_x = self.x[idx]
        sample_y = self.y[idx]
        # apply optional per-sample transforms before returning
        if self.transform:
            sample_x = self.transform(sample_x)
        if self.target_transform:
            sample_y = self.target_transform(sample_y)
        return sample_x, sample_y
def train_loop(dataloader, model, loss_fn, optimizer, silent = True):
    """Run one optimization epoch over *dataloader*: forward, loss,
    backward, and optimizer step per batch; optionally log every 100 batches."""
    size = len(dataloader.dataset)
    for batch, (features, targets) in enumerate(dataloader):
        # forward pass and loss
        predictions = model(features)
        batch_loss = loss_fn(predictions, targets)
        # backpropagation
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        if batch % 100 != 0:
            continue
        loss, current = batch_loss.item(), batch * len(features)
        if not silent:
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def test_loop(dataloader, model, loss_fn, silent = True):
size = len(dataloader.dataset)
num_batches = len(dataloader)
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
pred = model(X)
test_loss += loss_fn(pred, y).item()
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
test_loss /= num_batches
# correct /= size
if not silent:
print(f"Test Error: Avg loss: {test_loss:>8f}")
return(test_loss) #new
# # Demo 3d data (wthr)
# demo = df_prep()
# demo.get_train_test_Envs(df = meta,
# holdout_years = ['2020', '2014', '2016', '2022'],
# test_year = ['2020'] )
# demo.add_ys(df_data = df_date_to_datetime(df = meta, cols = ['Date_Planted']),
# add_cols = ['Date_Planted'])
# demo.add_xs(df_data = wthr,
# drop_cols = ['Year', 'Date', 'DOY'])
# demo.isolate_missing_y()
# demo.prep_idx_y()
# demo.mk_scale_dict(
# scale_cols = ['Date_Planted']+[e for e in list(wthr) if e not in ['Env', 'Year', 'Date', 'DOY']],
# return_cs_dict = False,
# store_cs_dict = True)
# demo.apply_scaling()
# train_y, train_x = demo.mk_arrays(
# split = 'train',
# obs_per_Env = 314,
# return_2d = False,
# missing_ys = False)
# test_y, test_x = demo.mk_arrays(
# split = 'test',
# obs_per_Env = 314,
# return_2d = False,
# missing_ys = False)
# [e.shape for e in [train_y, train_x, test_y, test_x]]
# train_y_tensor = torch.from_numpy(train_y).to(device).float()
# train_x_tensor = torch.from_numpy(train_x).to(device).float()
# test_y_tensor = torch.from_numpy(test_y).to(device).float()
# test_x_tensor = torch.from_numpy(test_x).to(device).float()
# training_dataloader = DataLoader(CustomDataset_wthr(y = train_y_tensor, x = train_x_tensor), batch_size = 64, shuffle = True)
# testing_dataloader = DataLoader(CustomDataset_wthr(y = test_y_tensor, x = test_x_tensor), batch_size = 64, shuffle = True)
# class NeuralNetwork(nn.Module):
# def __init__(self):
# super(NeuralNetwork, self).__init__()
# in_size = 314 * 16
# n1 = 16
# n2 = 16
# self.linear_relu_stack = nn.Sequential(
# nn.Flatten(),
# nn.Linear(in_size, n1),
# nn.ReLU(),
# nn.Linear(n1, n2),
# nn.ReLU(),
# nn.Linear(n2, 1)
# )
# def forward(self, x):
# # x = self.flatten(x)
# logits = self.linear_relu_stack(x)
# return logits
# model = NeuralNetwork().to(device)
# # print(model)
# # print(model(torch.rand(1, device=device))) # shows that it works
# # model(next(iter(training_dataloader))[0] ) # try prediction on one batch
# # learning_rate = 1e-3
# # batch_size = 64
# # epochs = 500
# #
# # # Initialize the loss function
# # loss_fn = nn.MSELoss()
# # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# #
# # loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
# # loss_df['MSE'] = np.nan
# #
# # for t in tqdm.tqdm(range(epochs)):
# # # print(f"Epoch {t+1}\n-------------------------------")
# # train_loop(training_dataloader, model, loss_fn, optimizer)
# #
# # loss_df.loc[loss_df.index == t, 'MSE'
# # ] = test_loop(testing_dataloader, model, loss_fn)
# #
# # print("Done!")
# def train_nn(
# training_dataloader,
# testing_dataloader,
# model,
# learning_rate = 1e-3,
# batch_size = 64,
# epochs = 500
# ):
# # Initialize the loss function
# loss_fn = nn.MSELoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
# loss_df['MSE'] = np.nan
# for t in tqdm.tqdm(range(epochs)):
# # print(f"Epoch {t+1}\n-------------------------------")
# train_loop(training_dataloader, model, loss_fn, optimizer)
# loss_df.loc[loss_df.index == t, 'MSE'
# ] = test_loop(testing_dataloader, model, loss_fn)
# # print("Done!")
# return([model, loss_df])
# model, loss_df = train_nn(
# training_dataloader,
# testing_dataloader,
# model,
# learning_rate = 1e-3,
# batch_size = 64,
# epochs = 500
# )
# yhats = model(test_x_tensor)
# yhats = yhats.cpu().detach().numpy()
# yobs = test_y_tensor
# yobs = yobs.cpu().detach().numpy()
# plt_df = pd.concat([
# pd.DataFrame(yhats, columns = ['yHat']),
# pd.DataFrame(yobs, columns = ['yObs'])],
# axis=1)
# px.line(loss_df, x = 'Epoch', y = 'MSE')
# px.scatter(plt_df, x = 'yObs', y = 'yHat')
# mean_squared_error(plt_df['yObs'], plt_df['yHat'], squared=False)
# Default held-out year (reassigned inside run_dnn_trial's CV loop below).
test_this_year = '2021'
# Setup ----------------------------------------------------------------------
# Run label used to name cached models / loss curves, plus DNN training
# hyperparameters shared by the trials below.
trial_name = 'DNN_hps_test'
learning_rate = 1e-3
batch_size = 64
epochs = 500
class NeuralNetwork(nn.Module):
    """Fully connected net over flattened (16 x 314) weather input,
    tapering 1024 -> 256 -> 64 -> 16 -> 2 down to one regression output."""

    def __init__(self):
        super(NeuralNetwork, self).__init__()
        in_size = 314 * 16
        widths = [in_size, 2**10, 2**8, 2**6, 2**4, 2]
        # Build the same Flatten -> (Linear, ReLU)*5 -> Linear stack as before,
        # just assembled in a loop.
        stages = [nn.Flatten()]
        for w_in, w_out in zip(widths[:-1], widths[1:]):
            stages.append(nn.Linear(w_in, w_out))
            stages.append(nn.ReLU())
        stages.append(nn.Linear(widths[-1], 1))
        self.linear_relu_stack = nn.Sequential(*stages)

    def forward(self, x):
        # single scalar prediction per sample
        logits = self.linear_relu_stack(x)
        return logits
def run_dnn_trial(
        trial_name = 'DNN_hps_test',
        learning_rate = 1e-3,
        batch_size = 64,
        epochs = 500):
    """Leave-one-year-out CV for the current `NeuralNetwork` definition.

    For each year, fits (or loads a cached) model on the other years'
    scaled weather data predicting scaled planting date, then writes
    per-split predictions and MSEs under `cache_path` + `trial_name`.
    Relies on module-level `meta`, `wthr`, `cache_path`, `device`,
    `df_prep`, `NeuralNetwork`, `train_loop`, and `test_loop`.
    """
    def _train_nn(training_dataloader, testing_dataloader, model,
                  learning_rate = 1e-3, batch_size = 64, epochs = 500):
        # Fit with SGD/MSE, recording test-set loss once per epoch.
        loss_fn = nn.MSELoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
        loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
        loss_df['MSE'] = np.nan
        for t in tqdm.tqdm(range(epochs)):
            train_loop(training_dataloader, model, loss_fn, optimizer)
            loss_df.loc[loss_df.index == t, 'MSE'
                        ] = test_loop(testing_dataloader, model, loss_fn)
        return([model, loss_df])

    reset_trial_name = trial_name
    for test_this_year in ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']:
        trial_name = reset_trial_name + test_this_year
        print(test_this_year)
        # Data Prep. ---------------------------------------------------------
        data_obj = df_prep()
        data_obj.get_train_test_Envs(df = meta,
                                     holdout_years = [],
                                     test_year = [test_this_year] )
        data_obj.add_ys(df_data = df_date_to_datetime(df = meta, cols = ['Date_Planted']),
                        add_cols = ['Date_Planted'])
        data_obj.add_xs(df_data = wthr,
                        drop_cols = ['Year', 'Date', 'DOY'])
        data_obj.isolate_missing_y()
        data_obj.prep_idx_y()
        data_obj.mk_scale_dict(
            scale_cols = ['Date_Planted']+[e for e in list(wthr) if e not in ['Env', 'Year', 'Date', 'DOY']],
            return_cs_dict = False,
            store_cs_dict = True)
        data_obj.apply_scaling()
        train_y, train_x = data_obj.mk_arrays(
            split = 'train',
            obs_per_Env = 314,
            return_2d = False,
            missing_ys = False)
        test_y, test_x = data_obj.mk_arrays(
            split = 'test',
            obs_per_Env = 314,
            return_2d = False,
            missing_ys = False)
        train_y_tensor = torch.from_numpy(train_y).to(device).float()
        train_x_tensor = torch.from_numpy(train_x).to(device).float()
        test_y_tensor = torch.from_numpy(test_y).to(device).float()
        test_x_tensor = torch.from_numpy(test_x).to(device).float()
        # FIX: honor the batch_size argument (was hard-coded to 64).
        training_dataloader = DataLoader(CustomDataset_wthr(y = train_y_tensor, x = train_x_tensor),
                                         batch_size = batch_size, shuffle = True)
        testing_dataloader = DataLoader(CustomDataset_wthr(y = test_y_tensor, x = test_x_tensor),
                                        batch_size = batch_size, shuffle = True)
        # Fit Mod ------------------------------------------------------------
        cache_save_name = cache_path+trial_name+'_mod.pth'
        if os.path.exists(cache_save_name):
            # FIX: the loaded model used to be immediately overwritten with a
            # fresh NeuralNetwork (and retrained), so the cache was never used.
            model = torch.load(cache_save_name)
        else:
            model = NeuralNetwork().to(device)
            model, loss_df = _train_nn(
                training_dataloader,
                testing_dataloader,
                model,
                learning_rate = learning_rate,
                batch_size = batch_size,
                epochs = epochs)
            # Save the fitted model and its loss curve.
            torch.save(model, cache_save_name)
            loss_df.to_csv(cache_path+trial_name+'_loss.csv', index = False)
        # Eval. Best HPS ------------------------------------------------------
        for i in range(4):
            # u denotes that the true value is unknown
            temp_label = ['train_y', 'test_y', 'train_u', 'test_u'][i]
            # FIX: include the '_' separator so this existence check matches the
            # file name actually written below (previously it never matched, so
            # results were recomputed and overwritten on every run).
            if os.path.exists(cache_path+trial_name+'_'+temp_label+".csv"):
                pass
            else:
                temp_data = [data_obj.train['y'], data_obj.test['y'],
                             data_obj.train['yna'], data_obj.test['yna']][i]
                if temp_data.shape[0] != 0:
                    # calculation step below =====================================
                    temp_y, temp_x = data_obj.mk_arrays(
                        split = temp_label.split('_')[0],
                        obs_per_Env = 314,
                        return_2d = False,
                        missing_ys = temp_label.split('_')[1] == 'u' )
                    temp_yHat = model(torch.from_numpy(temp_x).to(device).float())
                    temp_yHat = temp_yHat.cpu().detach().numpy()
                    # (dropped a no-op reshape whose result was discarded)
                    # calculation step above =====================================
                    temp_data['yHat'] = [e[0] for e in list(temp_yHat)]
                    temp_data.to_csv(cache_path+trial_name+'_'+temp_label+".csv", index = False)
                    if temp_label.split('_')[1] != 'u':
                        pd.DataFrame({'MSE':[mean_squared_error(temp_y, temp_yHat)]}
                                     ).to_csv(cache_path+trial_name+'_'+temp_label+"_mse.csv", index = False)
# Disabled by default: flip the guard to True to (re)run the deep DNN CV trial.
if False:
    run_dnn_trial(
        trial_name = 'DNN_hps_test',
        learning_rate = 1e-3,
        batch_size = 64,
        epochs = 500)
class NeuralNetwork(nn.Module):
    """Smaller fully connected variant over the flattened (16 x 314) input.

    Note: there is deliberately no activation between the first two Linear
    layers (matching the original definition).
    """

    def __init__(self):
        super(NeuralNetwork, self).__init__()
        in_size = 314 * 16
        width_a = 2**10
        width_b = 2**4
        width_c = 2
        self.linear_relu_stack = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_size, width_a),
            nn.Linear(width_a, width_b),
            nn.ReLU(),
            nn.Linear(width_b, width_c),
            nn.ReLU(),
            nn.Linear(width_c, 1)
        )

    def forward(self, x):
        return self.linear_relu_stack(x)
# Disabled by default: flip the guard to True to (re)run the small-DNN CV trial.
if False:
    run_dnn_trial(
        trial_name = 'DNNsmall_hps_test',
        learning_rate = 1e-3,
        batch_size = 64,
        epochs = 500)
class NeuralNetwork(nn.Module):
    """1-D CNN over weather series shaped (N, 16 channels, 314 days).

    Five stride-2 convolutions shrink the series 314 -> 156 -> 77 -> 38
    -> 18 -> 8 while the channels go 16 -> 32 -> 16 -> 8 -> 4 -> 2, so the
    flattened feature is 2 * 8 = 16 wide before the final Linear.
    """

    def __init__(self):
        super(NeuralNetwork, self).__init__()
        channel_steps = [16, 32, 16, 8, 4, 2]
        stages = []
        for c_in, c_out in zip(channel_steps[:-1], channel_steps[1:]):
            stages.append(nn.Conv1d(c_in, c_out, kernel_size = 3, stride=2))
            stages.append(nn.ReLU())
        stages.append(nn.Flatten())
        stages.append(nn.ReLU())
        stages.append(nn.Linear(16, 1))
        self.seq_model = nn.Sequential(*stages)

    def forward(self, x):
        return self.seq_model(x)
# Disabled by default: flip the guard to True to (re)run the CNN CV trial.
if False:
    run_dnn_trial(
        trial_name = 'CNNv1_hps_test',
        learning_rate = 1e-3,
        batch_size = 64,
        epochs = 500)
# code to help with checking dims
# test_x_ten = torch.from_numpy(test_x).float()
# test_x_ten.shape
# # Out shape
# # channels |
# # | |
# m = nn.Sequential(
# nn.Conv1d(16, 32, kernel_size = 3, stride=2),
# nn.ReLU(),
# nn.Conv1d(32, 16, kernel_size = 3, stride=2),
# nn.ReLU(),
# nn.Conv1d(16, 8, kernel_size = 3, stride=2),
# nn.ReLU(),
# nn.Conv1d(8, 4, kernel_size = 3, stride=2),
# nn.ReLU(),
# nn.Conv1d(4, 2, kernel_size = 3, stride=2),
# nn.ReLU(),
# nn.Flatten(),
# nn.ReLU(),
# nn.Linear(16, 1)
# )
# m(test_x_ten).shape
# Collect every cached per-epoch loss curve and plot them by model / CV year.
loss_files = [e for e in os.listdir(cache_path) if re.match('.+_loss.csv$', e)]
loss_dfs = [pd.read_csv(cache_path+e) for e in loss_files]
for i in range(len(loss_files)):
    loss_dfs[i]['File'] = loss_files[i]
loss_df = pd.concat(loss_dfs)
# File names look like '<Model>_<Stage>_<CV>_loss.csv', e.g. 'DNN_hps_test2015_loss.csv'.
loss_df[['Model', 'Stage', 'CV', 'Record']] = loss_df['File'].str.split('_', expand = True)
# NOTE(review): str.strip('test') strips any of the characters t/e/s from both
# ends, not the literal prefix 'test' — safe here only because the remainder
# is all digits. Consider str.removeprefix instead.
loss_df['CV'] = loss_df['CV'].str.strip('test')
px.line(loss_df, x = 'Epoch', y = 'MSE', color = 'CV', facet_col="Model")
# transform to panel data
def wthr_rank_3to2(x_3d):
    """Flatten (n_obs, n_days, n_metrics) weather into (n_obs, n_days*n_metrics)."""
    n_obs, day_count, metric_count = x_3d.shape
    flat_width = day_count * metric_count
    return x_3d.reshape(n_obs, flat_width)
def y_rank_2to1(y_2d):
    """Drop the trailing singleton axis: (n_obs, 1) -> (n_obs,)."""
    count = y_2d.shape[0]
    return y_2d.reshape(count)
# train_x_2d = wthr_rank_3to2(x_3d = train_x)
# train_y_1d = y_rank_2to1(y_2d = train_y)
# regr = RandomForestRegressor(max_depth= 16,
# random_state=0,
# n_estimators = 20)
# rf = regr.fit(train_x_2d, train_y_1d)
# mean_squared_error(train_y_1d, rf.predict(train_x_2d), squared=False)
# px.bar(pd.DataFrame(dict(cols=trn_xs.columns, imp=rf.feature_importances_)), x = 'cols', y = 'imp')
# Default held-out year (reassigned in the RF CV loop below).
test_this_year = '2021'
# Setup ----------------------------------------------------------------------
trial_name = 'rf_hps_test'  # prefix for cached optuna studies / fitted models
n_trials= 200  # optuna trials per CV fold
n_jobs = 20  # parallel optuna workers
def objective(trial):
    """Optuna objective for the random-forest planting-date model.

    FIX: score on a deterministic held-out ~20% validation slice instead of
    the training data itself — returning the training RMSE of a model fit on
    the same rows rewards memorization (ever-deeper trees, tiny
    min_samples_split), making the search pointless.
    Reads module-level `train_x_2d` / `train_y_1d`.
    """
    rf_max_depth = trial.suggest_int('rf_max_depth', 2, 200, log=True)
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 2, 200, log=True)
    rf_min_samples_split = trial.suggest_float('rf_min_samples_split', 0.01, 0.99, log=True)
    # Fixed seed so every trial is scored on the same split.
    rng = np.random.default_rng(0)
    idx = rng.permutation(train_x_2d.shape[0])
    n_val = max(1, len(idx) // 5)
    val_idx, fit_idx = idx[:n_val], idx[n_val:]
    if len(idx) < 5:
        # too little data to split; fall back to in-sample scoring
        fit_idx = val_idx = idx
    regr = RandomForestRegressor(
        max_depth = rf_max_depth,
        n_estimators = rf_n_estimators,
        min_samples_split = rf_min_samples_split
    )
    rf = regr.fit(train_x_2d[fit_idx], train_y_1d[fit_idx])
    return (mean_squared_error(train_y_1d[val_idx], rf.predict(train_x_2d[val_idx]), squared=False))
reset_trial_name = trial_name
# Disabled by default: flip the guard to True to (re)run the RF leave-one-year-out CV.
if False:
    for test_this_year in ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']:
        trial_name = reset_trial_name
        trial_name = trial_name+test_this_year
        print(test_this_year)
        # Data Prep. -----------------------------------------------------------------
        data_obj = df_prep()
        data_obj.get_train_test_Envs(df = meta,
                                     holdout_years = [],
                                     test_year = [test_this_year] )
        data_obj.add_ys(df_data = df_date_to_datetime(df = meta, cols = ['Date_Planted']),
                        add_cols = ['Date_Planted'])
        data_obj.add_xs(df_data = wthr,
                        drop_cols = ['Year', 'Date', 'DOY'])
        data_obj.isolate_missing_y()
        data_obj.prep_idx_y()
        data_obj.mk_scale_dict(
            scale_cols = ['Date_Planted']+[e for e in list(wthr) if e not in ['Env', 'Year', 'Date', 'DOY']],
            return_cs_dict = False,
            store_cs_dict = True)
        data_obj.apply_scaling()
        train_y, train_x = data_obj.mk_arrays(
            split = 'train',
            obs_per_Env = 314,
            return_2d = False,
            missing_ys = False)
        test_y, test_x = data_obj.mk_arrays(
            split = 'test',
            obs_per_Env = 314,
            return_2d = False,
            missing_ys = False)
        # Flatten to panel form for sklearn.
        train_x_2d = wthr_rank_3to2(x_3d = train_x)
        train_y_1d = y_rank_2to1(y_2d = train_y)
        # HPS Study ------------------------------------------------------------------
        cache_save_name = cache_path+trial_name+'_hps.pkl'
        if os.path.exists(cache_save_name):
            study = pkl.load(open(cache_save_name, 'rb'))
        else:
            study = optuna.create_study(direction="minimize")
            study.optimize(objective, n_trials= n_trials, n_jobs = n_jobs)
            # save
            pkl.dump(study, open(cache_save_name, 'wb'))
        # Fit Best HPS ---------------------------------------------------------------
        cache_save_name = cache_path+trial_name+'_mod.pkl'
        if os.path.exists(cache_save_name):
            rf = pkl.load(open(cache_save_name, 'rb'))
        else:
            regr = RandomForestRegressor(
                max_depth = study.best_trial.params['rf_max_depth'],
                n_estimators = study.best_trial.params['rf_n_estimators'],
                min_samples_split = study.best_trial.params['rf_min_samples_split']
            )
            rf = regr.fit(train_x_2d, train_y_1d)
            # save
            pkl.dump(rf, open(cache_save_name, 'wb'))
        # Eval. Best HPS -------------------------------------------------------------
        for i in range(4):
            # u denotes that the true value is unknown
            temp_label= ['train_y', 'test_y', 'train_u', 'test_u'][i]
            # FIX: include the '_' separator so this existence check matches the
            # file name written below (previously it never matched, so results
            # were recomputed and overwritten on every run).
            if os.path.exists(cache_path+trial_name+'_'+temp_label+".csv"):
                pass
            else:
                temp_data = [data_obj.train['y'], data_obj.test['y'],
                             data_obj.train['yna'], data_obj.test['yna']][i]
                if temp_data.shape[0] != 0:
                    # calculation step below =============================================
                    temp_y, temp_x = data_obj.mk_arrays(
                        split = temp_label.split('_')[0],
                        obs_per_Env = 314,
                        return_2d = False,
                        missing_ys = temp_label.split('_')[1] == 'u' )
                    temp_y = y_rank_2to1(y_2d = temp_y)
                    temp_yHat = rf.predict(wthr_rank_3to2(x_3d = temp_x))
                    # calculation step above =============================================
                    temp_data['yHat'] = list(temp_yHat)
                    temp_data.to_csv(cache_path+trial_name+'_'+temp_label+".csv", index = False)
                    if temp_label.split('_')[1] != 'u':
                        pd.DataFrame({'MSE':[mean_squared_error(temp_y, temp_yHat)]}
                                     ).to_csv(cache_path+trial_name+'_'+temp_label+"_mse.csv", index = False)
from xgboost import XGBRegressor
# Default held-out year (reassigned in the XGB CV loop below).
test_this_year = '2021'
# Setup ----------------------------------------------------------------------
trial_name = 'xgb_hps_test'  # prefix for cached optuna studies / fitted models
n_trials= 200 # FIXME
n_jobs = 20  # parallel optuna workers
def objective(trial):
    """Optuna objective for the XGBoost planting-date model.

    FIX: score on a deterministic held-out ~20% validation slice instead of
    the training data itself — training RMSE of a model fit on the same rows
    rewards memorization, making the hyperparameter search pointless.
    Reads module-level `train_x_2d` / `train_y_1d`.
    """
    xgb_max_depth = trial.suggest_int('xgb_max_depth', 2, 200, log=True)
    xgb_n_estimators = trial.suggest_int('xgb_n_estimators', 2, 200, log=True)
    xgb_learning_rate = trial.suggest_float('xgb_learning_rate', 0.0001, 0.3, log=True)
    # Fixed seed so every trial is scored on the same split.
    rng = np.random.default_rng(0)
    idx = rng.permutation(train_x_2d.shape[0])
    n_val = max(1, len(idx) // 5)
    val_idx, fit_idx = idx[:n_val], idx[n_val:]
    if len(idx) < 5:
        # too little data to split; fall back to in-sample scoring
        fit_idx = val_idx = idx
    regr = XGBRegressor(
        max_depth = xgb_max_depth,
        n_estimators = xgb_n_estimators,
        learning_rate = xgb_learning_rate,
        objective='reg:squarederror'
    )
    xgb = regr.fit(train_x_2d[fit_idx], train_y_1d[fit_idx])
    return (mean_squared_error(train_y_1d[val_idx], xgb.predict(train_x_2d[val_idx]), squared=False))
reset_trial_name = trial_name
# Disabled by default: flip the guard to True to (re)run the XGB leave-one-year-out CV.
if False:
    for test_this_year in ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']:
        trial_name = reset_trial_name
        trial_name = trial_name+test_this_year
        print(test_this_year)
        # Data Prep. -----------------------------------------------------------------
        data_obj = df_prep()
        data_obj.get_train_test_Envs(df = meta,
                                     holdout_years = [],
                                     test_year = [test_this_year] )
        data_obj.add_ys(df_data = df_date_to_datetime(df = meta, cols = ['Date_Planted']),
                        add_cols = ['Date_Planted'])
        data_obj.add_xs(df_data = wthr,
                        drop_cols = ['Year', 'Date', 'DOY'])
        data_obj.isolate_missing_y()
        data_obj.prep_idx_y()
        data_obj.mk_scale_dict(
            scale_cols = ['Date_Planted']+[e for e in list(wthr) if e not in ['Env', 'Year', 'Date', 'DOY']],
            return_cs_dict = False,
            store_cs_dict = True)
        data_obj.apply_scaling()
        train_y, train_x = data_obj.mk_arrays(
            split = 'train',
            obs_per_Env = 314,
            return_2d = False,
            missing_ys = False)
        test_y, test_x = data_obj.mk_arrays(
            split = 'test',
            obs_per_Env = 314,
            return_2d = False,
            missing_ys = False)
        # Flatten to panel form for xgboost.
        train_x_2d = wthr_rank_3to2(x_3d = train_x)
        train_y_1d = y_rank_2to1(y_2d = train_y)
        # HPS Study ------------------------------------------------------------------
        cache_save_name = cache_path+trial_name+'_hps.pkl'
        if os.path.exists(cache_save_name):
            study = pkl.load(open(cache_save_name, 'rb'))
        else:
            study = optuna.create_study(direction="minimize")
            study.optimize(objective, n_trials= n_trials, n_jobs = n_jobs)
            # save
            pkl.dump(study, open(cache_save_name, 'wb'))
        # Fit Best HPS ---------------------------------------------------------------
        cache_save_name = cache_path+trial_name+'_mod.pkl'
        if os.path.exists(cache_save_name):
            xgb = pkl.load(open(cache_save_name, 'rb'))
        else:
            regr = XGBRegressor(
                max_depth = study.best_trial.params['xgb_max_depth'],
                n_estimators = study.best_trial.params['xgb_n_estimators'],
                learning_rate = study.best_trial.params['xgb_learning_rate'],
                objective='reg:squarederror'
            )
            xgb = regr.fit(train_x_2d, train_y_1d)
            # save
            pkl.dump(xgb, open(cache_save_name, 'wb'))
        # Eval. Best HPS -------------------------------------------------------------
        for i in range(4):
            # u denotes that the true value is unknown
            temp_label= ['train_y', 'test_y', 'train_u', 'test_u'][i]
            # FIX: include the '_' separator so this existence check matches the
            # file name written below (previously it never matched, so results
            # were recomputed and overwritten on every run).
            if os.path.exists(cache_path+trial_name+'_'+temp_label+".csv"):
                pass
            else:
                temp_data = [data_obj.train['y'], data_obj.test['y'],
                             data_obj.train['yna'], data_obj.test['yna']][i]
                if temp_data.shape[0] != 0:
                    # calculation step below =============================================
                    temp_y, temp_x = data_obj.mk_arrays(
                        split = temp_label.split('_')[0],
                        obs_per_Env = 314,
                        return_2d = False,
                        missing_ys = temp_label.split('_')[1] == 'u' )
                    temp_y = y_rank_2to1(y_2d = temp_y)
                    temp_yHat = xgb.predict(wthr_rank_3to2(x_3d = temp_x))
                    # calculation step above =============================================
                    temp_data['yHat'] = list(temp_yHat)
                    temp_data.to_csv(cache_path+trial_name+'_'+temp_label+".csv", index = False)
                    if temp_label.split('_')[1] != 'u':
                        pd.DataFrame({'MSE':[mean_squared_error(temp_y, temp_yHat)]}
                                     ).to_csv(cache_path+trial_name+'_'+temp_label+"_mse.csv", index = False)
Test out inverse-variance weighting of the per-model predictions.
# Find all errors
# Gather every cached '*_mse.csv' from the hyperparameter-search runs into one frame.
mse_files = [e for e in os.listdir(cache_path) if re.match('.+_mse.csv$', e)]
mse_files = [e for e in mse_files if re.match('.+hps.+', e)]
mod_mses = [pd.read_csv(cache_path+e) for e in mse_files]
for i in range(len(mse_files)):
    mod_mses[i]['File'] = mse_files[i]
mod_mse = pd.concat(mod_mses)
# File pattern: '<Model>_hps_test<year>_<set>_y_mse.csv'
mod_mse[['Model', 'Stage', 'CV', 'Set', 'Discard1', 'Discard2']] = mod_mse['File'].str.split('_', expand = True)
mod_mse = mod_mse.drop(columns = ['Discard1', 'Discard2'])
# NOTE(review): strip('test') removes t/e/s characters from both ends, not the
# literal prefix — fine while the remainder is all digits.
mod_mse['CV'] = mod_mse['CV'].str.strip('test')
mod_mse.loc[mod_mse.Model == 'rf', ]
| MSE | File | Model | Stage | CV | Set | |
|---|---|---|---|---|---|---|
| 0 | 0.076830 | rf_hps_test2015_train_y_mse.csv | rf | hps | 2015 | train |
| 0 | 0.288083 | rf_hps_test2014_test_y_mse.csv | rf | hps | 2014 | test |
| 0 | 0.384775 | rf_hps_test2018_test_y_mse.csv | rf | hps | 2018 | test |
| 0 | 0.081612 | rf_hps_test2020_train_y_mse.csv | rf | hps | 2020 | train |
| 0 | 0.085405 | rf_hps_test2019_train_y_mse.csv | rf | hps | 2019 | train |
| 0 | 0.085814 | rf_hps_test2014_train_y_mse.csv | rf | hps | 2014 | train |
| 0 | 0.675280 | rf_hps_test2019_test_y_mse.csv | rf | hps | 2019 | test |
| 0 | 0.299519 | rf_hps_test2021_test_y_mse.csv | rf | hps | 2021 | test |
| 0 | 0.077572 | rf_hps_test2017_train_y_mse.csv | rf | hps | 2017 | train |
| 0 | 0.486941 | rf_hps_test2016_test_y_mse.csv | rf | hps | 2016 | test |
| 0 | 0.078880 | rf_hps_test2021_train_y_mse.csv | rf | hps | 2021 | train |
| 0 | 0.076775 | rf_hps_test2018_train_y_mse.csv | rf | hps | 2018 | train |
| 0 | 0.381846 | rf_hps_test2020_test_y_mse.csv | rf | hps | 2020 | test |
| 0 | 0.352149 | rf_hps_test2017_test_y_mse.csv | rf | hps | 2017 | test |
| 0 | 0.070698 | rf_hps_test2016_train_y_mse.csv | rf | hps | 2016 | train |
| 0 | 0.507988 | rf_hps_test2015_test_y_mse.csv | rf | hps | 2015 | test |
# Visualize train/test MSE per model across CV folds.
px.scatter(mod_mse, x = "Model", y = 'MSE', color = 'CV')
# Gather every cached held-out-year prediction file (known targets only).
test_files = [e for e in os.listdir(cache_path) if re.match('.+_test_y.csv$', e)]
test_files = [e for e in test_files if re.match('.+hps.+', e)]
test_preds = [pd.read_csv(cache_path+e) for e in test_files]
for i in range(len(test_files)):
    test_preds[i]['File'] = test_files[i]
test_pred = pd.concat(test_preds)
# File pattern: '<Model>_hps_test<year>_test_y.csv'
test_pred[['Model', 'Stage', 'CV', 'Set', 'Discard1']] = test_pred['File'].str.split('_', expand = True)
test_pred = test_pred.drop(columns = ['Discard1'])
# NOTE(review): strip('test') removes t/e/s characters, not the literal prefix.
test_pred['CV'] = test_pred['CV'].str.strip('test')
test_pred
| Env | Year | Date_Planted | yHat | File | Model | Stage | CV | Set | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | DEH1_2015 | 2015 | -0.320742 | 0.307332 | xgb_hps_test2015_test_y.csv | xgb | hps | 2015 | test |
| 1 | GAH1_2015 | 2015 | -1.630165 | -1.195790 | xgb_hps_test2015_test_y.csv | xgb | hps | 2015 | test |
| 2 | IAH1_2015 | 2015 | -0.180447 | 0.273933 | xgb_hps_test2015_test_y.csv | xgb | hps | 2015 | test |
| 3 | IAH3_2015 | 2015 | -0.273977 | 0.260193 | xgb_hps_test2015_test_y.csv | xgb | hps | 2015 | test |
| 4 | IAH4_2015 | 2015 | -0.227212 | 0.345641 | xgb_hps_test2015_test_y.csv | xgb | hps | 2015 | test |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 22 | NCH1_2015 | 2015 | -0.367508 | -0.549053 | DNNsmall_hps_test2015_test_y.csv | DNNsmall | hps | 2015 | test |
| 23 | TXH1_2015 | 2015 | -2.799293 | -0.511324 | DNNsmall_hps_test2015_test_y.csv | DNNsmall | hps | 2015 | test |
| 24 | NEH2_2015 | 2015 | -0.601333 | -0.509826 | DNNsmall_hps_test2015_test_y.csv | DNNsmall | hps | 2015 | test |
| 25 | NEH3_2015 | 2015 | 1.643392 | -0.484959 | DNNsmall_hps_test2015_test_y.csv | DNNsmall | hps | 2015 | test |
| 26 | NYH3_2015 | 2015 | 0.801620 | -0.536663 | DNNsmall_hps_test2015_test_y.csv | DNNsmall | hps | 2015 | test |
1125 rows × 9 columns
# Pivot long predictions to one row per Env with a prediction column per model.
test_pred_wide = test_pred.pivot(columns='Model', values='yHat', index = ['Env', 'Year', 'CV', 'Date_Planted'])
test_pred_wide = test_pred_wide.reset_index()
test_pred_wide
| Model | Env | Year | CV | Date_Planted | CNNv1 | DNN | DNNsmall | rf | xgb |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ARH1_2016 | 2016 | 2016 | -1.316889 | -0.009841 | -0.263118 | 0.333260 | -1.005455 | -0.441560 |
| 1 | ARH1_2017 | 2017 | 2017 | -0.919991 | NaN | NaN | NaN | -0.930448 | -0.909342 |
| 2 | ARH1_2017 | 2017 | 2017 | -0.919991 | 0.188228 | -0.030083 | 0.590413 | NaN | NaN |
| 3 | ARH1_2018 | 2018 | 2018 | -0.545119 | -0.160902 | 0.402216 | -0.194347 | -1.004815 | -0.620110 |
| 4 | ARH2_2016 | 2016 | 2016 | -0.575404 | -0.012026 | -0.256440 | 0.373882 | -0.522308 | -0.272932 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 298 | WIH2_2019 | 2019 | 2019 | 1.402418 | 0.322846 | -0.382385 | -0.253964 | NaN | NaN |
| 299 | WIH2_2020 | 2020 | 2020 | 0.392131 | 0.070860 | -0.222047 | 0.501853 | 0.460185 | 0.336505 |
| 300 | WIH2_2021 | 2021 | 2021 | 0.795410 | -0.181260 | -0.203308 | 0.605871 | 0.412373 | 0.397376 |
| 301 | WIH3_2020 | 2020 | 2020 | 0.768499 | 0.072733 | -0.222036 | 0.501853 | 0.508769 | 0.338763 |
| 302 | WIH3_2021 | 2021 | 2021 | -0.378869 | -0.184203 | -0.205550 | 0.618002 | 0.542042 | 0.568585 |
303 rows × 9 columns
# Pairwise scatter of observed planting date vs each model's prediction.
px.scatter_matrix(test_pred_wide, dimensions=['Date_Planted', 'CNNv1', 'DNN', 'DNNsmall', 'rf', 'xgb'],
                  color = 'CV')
/home/labmember/miniconda3/envs/g2f_comp/lib/python3.10/site-packages/plotly/express/_core.py:279: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
# Per-model ensemble weights for the planting-date predictions.
mod_list = [e for e in list(test_pred_wide) if e not in ['Env', 'Year', 'CV', 'Date_Planted']]
var_list = [(test_pred_wide['Date_Planted'] - test_pred_wide[e]).var() for e in mod_list]
# FIX: inverse-variance weights must be proportional to 1/variance; the
# previous code normalized the raw variances (e/sum(var_list)), which gave the
# *worst* models the *largest* weights — consistent with 'iVar' underperforming
# the plain mean in the results above.
inv_var_list = [1.0/e for e in var_list]
wht_list = [e/np.sum(inv_var_list) for e in inv_var_list]
# Aggregate estimates using simple averaging and inverse variance weighting
yHat_ave_accumulator = sum(test_pred_wide[m]*(1/len(mod_list)) for m in mod_list)
yHat_invVar_accumulator = sum(test_pred_wide[m]*w for m, w in zip(mod_list, wht_list))
test_pred_wide['mean'] = yHat_ave_accumulator
test_pred_wide['iVar'] = yHat_invVar_accumulator
# Long format (one row per Env x model/ensemble prediction), then MSE per CV fold.
test_pred_long = test_pred_wide.melt(id_vars=['Env', 'Year', 'CV', 'Date_Planted'])
test_pred_long['ErrorSq'] = (test_pred_long['Date_Planted'] - test_pred_long['value'])**2
test_pred_long = test_pred_long.groupby(['CV', 'Model']).agg(MSE = ('ErrorSq', np.mean)).reset_index()
test_pred_long.head()
| CV | Model | MSE | |
|---|---|---|---|
| 0 | 2014 | CNNv1 | 0.782146 |
| 1 | 2014 | DNN | 0.972487 |
| 2 | 2014 | DNNsmall | 0.774705 |
| 3 | 2014 | iVar | 0.340465 |
| 4 | 2014 | mean | 0.250416 |
# Compare per-fold MSE across the individual models and the two ensembles.
px.box(test_pred_long, x = 'Model', y = 'MSE', color = 'Model')
# write out a log of the enviroments imputed
# NOTE(review): log_imputed_envs is assumed to be defined earlier in the
# notebook — it is not visible in this section.
log_imputed_envs(
    df = meta,
    df_name = 'meta',
    col = 'Date_Planted'
)
# get scaling to use on predictions
def temp_fcn(test_this_year):
    """Rebuild the CV-fold data pipeline for *test_this_year* and return the
    centering/scaling dict for 'Date_Planted', so cached (scaled) predictions
    can be mapped back to day-of-year."""
    fold = df_prep()
    fold.get_train_test_Envs(df = meta,
                             holdout_years = [],
                             test_year = [test_this_year] )
    fold.add_ys(df_data = df_date_to_datetime(df = meta, cols = ['Date_Planted']),
                add_cols = ['Date_Planted'])
    fold.add_xs(df_data = wthr,
                drop_cols = ['Year', 'Date', 'DOY'])
    fold.isolate_missing_y()
    fold.prep_idx_y()
    return fold.mk_scale_dict(
        scale_cols = ['Date_Planted'],
        return_cs_dict = True,
        store_cs_dict = False)
# One row of Date_Planted scaling stats (mean/std) per CV fold.
years_tested = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']
yHat_scaling = [pd.DataFrame(temp_fcn(test_this_year)['Date_Planted'], index = [test_this_year]) for test_this_year in years_tested]
yHat_scaling = pd.concat(yHat_scaling).reset_index().rename(columns = {'index':'CV'})
yHat_scaling.head()
| CV | mean | std | |
|---|---|---|---|
| 0 | 2014 | 125.670000 | 21.571534 |
| 1 | 2015 | 125.858586 | 21.383469 |
| 2 | 2016 | 126.416244 | 21.578315 |
| 3 | 2017 | 126.659794 | 21.369545 |
| 4 | 2018 | 126.688776 | 21.442616 |
# Model whose cached predictions will be used for imputation.
impute_with_mod = 'rf'
# Load every cached prediction csv for that model (all folds/splits),
# excluding the *_mse summary files.
file_list = [e for e in os.listdir(cache_path) if re.match('^'+impute_with_mod+'.+.csv$', e) and not re.match('.+mse.csv$', e)]
# allow for all to further model stacking
# for now I'm just going to use one model
#file_list = [e for e in file_list if not re.match('.+train_y.csv$', e)]
table_list = [pd.read_csv(cache_path+e) for e in file_list]
for i in range(len(file_list)):
    table_list[i]['File'] = file_list[i]
yHat = pd.concat(table_list)
# File pattern: '<Model>_hps_test<year>_<split>_<y|u>.csv'
yHat[['Model', 'Stage', 'CV', 'Set', 'Value_Known']] = yHat['File'].str.split('_', expand = True)
# NOTE(review): strip('test') removes t/e/s characters, not the literal prefix.
yHat['CV'] = yHat['CV'].str.strip('test')
yHat.head()
| Env | Year | Date_Planted | yHat | File | Date_Harvested | Model | Stage | CV | Set | Value_Known | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ARH1_2016 | 2016 | NaN | -1.150804 | rf_hps_test2014_train_u.csv | NaN | rf | hps | 2014 | train | u.csv |
| 1 | DEH1_2016 | 2016 | NaN | -0.308043 | rf_hps_test2014_train_u.csv | NaN | rf | hps | 2014 | train | u.csv |
| 2 | GAH1_2016 | 2016 | NaN | -0.979720 | rf_hps_test2014_train_u.csv | NaN | rf | hps | 2014 | train | u.csv |
| 3 | GAH2_2016 | 2016 | NaN | 0.080971 | rf_hps_test2014_train_u.csv | NaN | rf | hps | 2014 | train | u.csv |
| 4 | KSH1_2016 | 2016 | NaN | -0.464470 | rf_hps_test2014_train_u.csv | NaN | rf | hps | 2014 | train | u.csv |
# Un-scale each fold's predictions back to day-of-year using that fold's mean/std,
# then average across folds/models to one prediction per Env.
yHat = yHat.merge(yHat_scaling)
yHat['yHat'] = (yHat['yHat']*yHat['std'])+yHat['mean']
yHat = yHat.loc[:, ['Env', 'Year', 'Model', 'CV', 'yHat']]
# TODO if using multiple models and weights, those should be applied here
# 'mean' instead of np.nanmean: passing numpy callables to .agg is deprecated in
# pandas, and groupby 'mean' already skips NaNs — identical result.
yHat= yHat.groupby(['Env']).agg(yHat = ('yHat', 'mean')).reset_index()
yHat.head()
| Env | yHat | |
|---|---|---|
| 0 | ARH1_2016 | 101.815710 |
| 1 | ARH1_2017 | 106.963838 |
| 2 | ARH1_2018 | 107.647310 |
| 3 | ARH2_2016 | 121.680969 |
| 4 | ARH2_2017 | 111.258920 |
# Impute missing values
# Convert Date_Planted to day-of-year, then fill missing DOY entries with the
# model prediction (yHat) for that Env; `meta` is rebound at the end.
temp = meta
temp = temp.loc[:, ['Year', 'Date_Planted']].drop_duplicates()
temp['Date_Str'] = temp['Date_Planted'].astype(str)
# NOTE(review): assumes missing dates survive the str -> pd.Period round trip as
# NaT so their DOY stays missing — confirm against the raw data
temp['DOY'] = [pd.Period(e, freq='D').day_of_year for e in list(temp['Date_Str'])]
temp = meta.merge(temp).merge(yHat)
px.scatter(temp, 'DOY', 'yHat')  # sanity check: predictions vs. known DOY
mask = (temp.DOY.isna())
temp.loc[mask, 'DOY'] = temp.loc[mask, 'yHat']
# Date_Planted is now numeric DOY (imputed where it was missing)
temp = temp.drop(columns = ['Date_Str', 'yHat', 'Date_Planted']).rename(columns = {'DOY': 'Date_Planted'})
meta = temp
summarize_col_missing(meta)
| Col | N_miss | Pr_Comp | |
|---|---|---|---|
| 0 | Env | 0 | 100.0 |
| 1 | Year | 0 | 100.0 |
| 2 | Hybrid | 0 | 100.0 |
| 3 | Date_Harvested | 12680 | 91.3 |
| 4 | Pounds_Needed_Soil_Moisture | 0 | 100.0 |
| 5 | Latitude_of_Field | 0 | 100.0 |
| 6 | Longitude_of_Field | 0 | 100.0 |
| 7 | Cover_beet | 0 | 100.0 |
| 8 | Cover_corn | 0 | 100.0 |
| 9 | Cover_cotton | 0 | 100.0 |
| 10 | Cover_fallow | 0 | 100.0 |
| 11 | Cover_lima | 0 | 100.0 |
| 12 | Cover_peanut | 0 | 100.0 |
| 13 | Cover_pumpkin | 0 | 100.0 |
| 14 | Cover_rye | 0 | 100.0 |
| 15 | Cover_sorghum | 0 | 100.0 |
| 16 | Cover_soy | 0 | 100.0 |
| 17 | Cover_wheat | 0 | 100.0 |
| 18 | Pre_Chisel | 0 | 100.0 |
| 19 | Pre_Cult | 0 | 100.0 |
| 20 | Pre_Disc | 0 | 100.0 |
| 21 | Pre_MinTill | 0 | 100.0 |
| 22 | Date_Planted | 0 | 100.0 |
# Setup ----------------------------------------------------------------------
# Optuna search settings for the Date_Harvested imputation model.
trial_name = 'rf_impHarvest_test'  # per-year suffix is appended in the CV loop below
n_trials= 200
n_jobs = 20
def objective(trial):
    """Optuna objective: train-set RMSE of a random forest under sampled HPs.

    Reads train_x_2d / train_y_1d from the enclosing notebook scope.
    """
    hps = {
        'max_depth': trial.suggest_int('rf_max_depth', 2, 200, log=True),
        'n_estimators': trial.suggest_int('rf_n_estimators', 2, 200, log=True),
        'min_samples_split': trial.suggest_float('rf_min_samples_split', 0.01, 0.99, log=True),
    }
    fitted = RandomForestRegressor(**hps).fit(train_x_2d, train_y_1d)
    # squared=False -> RMSE; note the score is on the *training* data
    return mean_squared_error(train_y_1d, fitted.predict(train_x_2d), squared=False)
# Per-year CV loop: for each held-out year, hyper-parameter-search and fit a
# random forest predicting (scaled) harvest DOY from the weather series.
# Guarded by `if False:` so the slow search only runs when manually enabled;
# study / model / prediction artifacts are cached under cache_path.
reset_trial_name = trial_name
if False:
    for test_this_year in ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']:
        trial_name = reset_trial_name
        trial_name = trial_name+test_this_year  # e.g. 'rf_impHarvest_test2014'
        print(test_this_year)
        # Data Prep. -----------------------------------------------------------------
        data_obj = df_prep()
        data_obj.get_train_test_Envs(df = meta,
                                     holdout_years = [],
                                     test_year = [test_this_year] )
        data_obj.add_ys(df_data = df_date_to_datetime(df = meta, cols = ['Date_Harvested']),
                        add_cols = ['Date_Harvested'])
        data_obj.add_xs(df_data = wthr,
                        drop_cols = ['Year', 'Date', 'DOY'])
        data_obj.isolate_missing_y()
        data_obj.prep_idx_y()
        # center/scale the target and every weather feature on the training split
        data_obj.mk_scale_dict(
            scale_cols = ['Date_Harvested']+[e for e in list(wthr) if e not in ['Env', 'Year', 'Date', 'DOY']],
            return_cs_dict = False,
            store_cs_dict = True)
        data_obj.apply_scaling()
        # 314 = weather days retained per Env (weather was clipped at DOY 314)
        train_y, train_x = data_obj.mk_arrays(
            split = 'train',
            obs_per_Env = 314,
            return_2d = False,
            missing_ys = False)
        test_y, test_x = data_obj.mk_arrays(
            split = 'test',
            obs_per_Env = 314,
            return_2d = False,
            missing_ys = False)
        # flatten 3d weather / 2d targets to sklearn-friendly shapes
        train_x_2d = wthr_rank_3to2(x_3d = train_x)
        train_y_1d = y_rank_2to1(y_2d = train_y)
        # HPS Study ------------------------------------------------------------------
        cache_save_name = cache_path+trial_name+'_hps.pkl'
        if os.path.exists(cache_save_name):
            study = pkl.load(open(cache_save_name, 'rb'))
        else:
            study = optuna.create_study(direction="minimize")
            study.optimize(objective, n_trials= n_trials, n_jobs = n_jobs)
            # save
            pkl.dump(study, open(cache_save_name, 'wb'))
        # Fit Best HPS ---------------------------------------------------------------
        cache_save_name = cache_path+trial_name+'_mod.pkl'
        if os.path.exists(cache_save_name):
            rf = pkl.load(open(cache_save_name, 'rb'))
        else:
            regr = RandomForestRegressor(
                max_depth = study.best_trial.params['rf_max_depth'],
                n_estimators = study.best_trial.params['rf_n_estimators'],
                min_samples_split = study.best_trial.params['rf_min_samples_split']
            )
            rf = regr.fit(train_x_2d, train_y_1d)
            # save
            pkl.dump(rf, open(cache_save_name, 'wb'))
        # Eval. Best HPS -------------------------------------------------------------
        for i in range(4):
            # u denotes that the true value is unknown
            temp_label= ['train_y', 'test_y', 'train_u', 'test_u'][i]
            # FIX: the existence check was missing the '_' separator used when the
            # CSV is written below, so cached results were never detected and every
            # split was re-predicted on each run.
            if os.path.exists(cache_path+trial_name+'_'+temp_label+".csv"):
                pass
            else:
                temp_data = [data_obj.train['y'], data_obj.test['y'], data_obj.train['yna'], data_obj.test['yna']][i]
                if temp_data.shape[0] != 0:
                    # calculation step below =============================================
                    temp_y, temp_x = data_obj.mk_arrays(
                        split = temp_label.split('_')[0],
                        obs_per_Env = 314,
                        return_2d = False,
                        missing_ys = temp_label.split('_')[1] == 'u' )
                    temp_y = y_rank_2to1(y_2d = temp_y)
                    temp_yHat = rf.predict(wthr_rank_3to2(x_3d = temp_x))
                    # calculation step above =============================================
                    temp_data['yHat'] = list(temp_yHat)  # NOTE: mutates data_obj's frame in place
                    temp_data.to_csv(cache_path+trial_name+'_'+temp_label+".csv", index = False)
                    # only score the splits where the truth is known
                    if temp_label.split('_')[1] != 'u':
                        pd.DataFrame({'MSE':[mean_squared_error(temp_y, temp_yHat)]}).to_csv(cache_path+trial_name+'_'+temp_label+"_mse.csv", index = False)
# write out a log of the environments imputed
# (same project helper as for Date_Planted above)
log_imputed_envs(
    df = meta,
    df_name = 'meta',
    col = 'Date_Harvested'
)
def temp_fcn(test_this_year, col='Date_Harvested'):
    """Return the train-split centering/scaling stats for `col` with one year held out.

    Parameters
    ----------
    test_this_year : str
        Year (e.g. '2014') whose Envs are held out as the test split.
    col : str, default 'Date_Harvested'
        Date column to fit the mean/std for. Parameterized so this helper is no
        longer a near-duplicate of the Date_Planted version above.

    Returns
    -------
    dict keyed by column name with the fitted center/scale values, suitable for
    un-scaling model predictions back to day-of-year units.
    """
    demo = df_prep()
    demo.get_train_test_Envs(df=meta,
                             holdout_years=[],
                             test_year=[test_this_year])
    demo.add_ys(df_data=df_date_to_datetime(df=meta, cols=[col]),
                add_cols=[col])
    demo.add_xs(df_data=wthr,
                drop_cols=['Year', 'Date', 'DOY'])
    demo.isolate_missing_y()
    demo.prep_idx_y()
    # return (rather than store) the dict so the caller can apply it to predictions
    return demo.mk_scale_dict(scale_cols=[col],
                              return_cs_dict=True,
                              store_cs_dict=False)
# get scaling to use on predictions
# Fit the per-CV-year mean/std of Date_Harvested; one row per CV year.
years_tested = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']
yHat_scaling = [pd.DataFrame(temp_fcn(test_this_year)['Date_Harvested'], index = [test_this_year]) for test_this_year in years_tested]
yHat_scaling = pd.concat(yHat_scaling).reset_index().rename(columns = {'index':'CV'})
yHat_scaling.head()
| CV | mean | std | |
|---|---|---|---|
| 0 | 2014 | 282.951724 | 31.503329 |
| 1 | 2015 | 283.971429 | 31.346096 |
| 2 | 2016 | 284.104693 | 31.989843 |
| 3 | 2017 | 284.247312 | 31.201503 |
| 4 | 2018 | 285.010909 | 30.992021 |
# Load all cached prediction CSVs for the chosen model family, skipping the
# *_mse.csv score summaries (same recipe as for Date_Planted).
impute_with_mod = 'rf'
# FIX: escape the dot — the old pattern '.+.csv$' treated '.' as a wildcard and
# would also match names merely *ending* in 'csv'.
file_list = [e for e in os.listdir(cache_path)
             if re.match('^' + impute_with_mod + r'.+\.csv$', e)
             and not re.match(r'.+mse\.csv$', e)]
# allow for all to further model stacking
# for now I'm just going to use one model
#file_list = [e for e in file_list if not re.match('.+train_y.csv$', e)]
table_list = [pd.read_csv(cache_path+e) for e in file_list]
for i in range(len(file_list)):
    table_list[i]['File'] = file_list[i]
yHat = pd.concat(table_list)
# file names look like 'rf_hps_test2014_train_u.csv'
yHat[['Model', 'Stage', 'CV', 'Set', 'Value_Known']] = yHat['File'].str.split('_', expand = True)
# FIX: strip('test') removes the *characters* t/e/s from both ends (fragile);
# removeprefix drops only the literal leading 'test'.
yHat['CV'] = yHat['CV'].str.removeprefix('test')
yHat.head()
| Env | Year | Date_Planted | yHat | File | Date_Harvested | Model | Stage | CV | Set | Value_Known | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ARH1_2016 | 2016 | NaN | -1.150804 | rf_hps_test2014_train_u.csv | NaN | rf | hps | 2014 | train | u.csv |
| 1 | DEH1_2016 | 2016 | NaN | -0.308043 | rf_hps_test2014_train_u.csv | NaN | rf | hps | 2014 | train | u.csv |
| 2 | GAH1_2016 | 2016 | NaN | -0.979720 | rf_hps_test2014_train_u.csv | NaN | rf | hps | 2014 | train | u.csv |
| 3 | GAH2_2016 | 2016 | NaN | 0.080971 | rf_hps_test2014_train_u.csv | NaN | rf | hps | 2014 | train | u.csv |
| 4 | KSH1_2016 | 2016 | NaN | -0.464470 | rf_hps_test2014_train_u.csv | NaN | rf | hps | 2014 | train | u.csv |
# Un-scale each fold's predictions back to day-of-year using that fold's mean/std,
# then average across folds/models to one prediction per Env.
yHat = yHat.merge(yHat_scaling)
yHat['yHat'] = (yHat['yHat']*yHat['std'])+yHat['mean']
yHat = yHat.loc[:, ['Env', 'Year', 'Model', 'CV', 'yHat']]
# TODO if using multiple models and weights, those should be applied here
# 'mean' instead of np.nanmean: passing numpy callables to .agg is deprecated in
# pandas, and groupby 'mean' already skips NaNs — identical result.
yHat= yHat.groupby(['Env']).agg(yHat = ('yHat', 'mean')).reset_index()
yHat.head()
| Env | yHat | |
|---|---|---|
| 0 | ARH1_2016 | 248.550114 |
| 1 | ARH1_2017 | 256.140411 |
| 2 | ARH1_2018 | 257.120679 |
| 3 | ARH2_2016 | 277.717511 |
| 4 | ARH2_2017 | 262.445093 |
# Impute missing values
# Same recipe as for Date_Planted above: convert Date_Harvested to day-of-year,
# then fill missing entries from the per-Env model predictions (yHat).
temp = meta
temp = temp.loc[:, ['Year', 'Date_Harvested']].drop_duplicates()
temp['Date_Str'] = temp['Date_Harvested'].astype(str)
# NOTE(review): assumes missing dates survive the str -> pd.Period round trip as
# NaT so their DOY stays missing — confirm against the raw data
temp['DOY'] = [pd.Period(e, freq='D').day_of_year for e in list(temp['Date_Str'])]
temp = meta.merge(temp).merge(yHat)
px.scatter(temp, 'DOY', 'yHat')  # sanity check: predicted vs. known harvest DOY
mask = (temp.DOY.isna())
temp.loc[mask, 'DOY'] = temp.loc[mask, 'yHat']
# Date_Harvested is now numeric DOY (imputed where it was missing)
temp = temp.drop(columns = ['Date_Str', 'yHat', 'Date_Harvested']).rename(columns = {'DOY': 'Date_Harvested'})
meta = temp
# Completeness report: for each cleaned table, print its column count and the
# worst-case (minimum) completion percentage across columns.
print('Cols:', '\t|', 'Min Completion %:')
print('-----', '\t|', '-----------------')
for tbl in [meta, soil, wthr, cgmv]:
    miss = summarize_col_missing(tbl)
    print(miss.shape[0], '\t|', miss.Pr_Comp.min())
Cols: | Min Completion %: ----- | ----------------- 23 | 100.0 25 | 100.0 20 | 100.0 767 | 100.0
# Standardize soil column names into identifier-safe tokens:
# '%' -> 'Pr' (percent), '/' -> 'per', '1:1' -> '1x1', spaces/dashes -> '_'.
soil = soil.rename(columns = {
    'E Depth': 'E_Depth',
    '1:1 Soil pH': 'Soil_pH_1x1',
    'WDRF Buffer pH': 'WDRF_Buffer_pH',
    '1:1 S Salts mmho/cm': 'S_Salts_mmhopercm_1x1',
    'Texture No': 'Texture_No',
    'Organic Matter LOI %': 'Organic_Matter_LOI_Pr',
    'Nitrate-N ppm N': 'Nitrate_N_ppm_N',
    'lbs N/A': 'lbs_NperA',
    'Potassium ppm K': 'Potassium_ppm_K',
    'Sulfate-S ppm S': 'Sulfate_S_ppm_S',
    'Calcium ppm Ca': 'Calcium_ppm_Ca',
    'Magnesium ppm Mg': 'Magnesium_ppm_Mg',
    'Sodium ppm Na': 'Sodium_ppm_Na',
    'CEC/Sum of Cations me/100g': 'CECperSum_of_Cations_meper100g',
    '%H Sat': 'PrH_Sat',
    '%K Sat': 'PrK_Sat',
    '%Ca Sat': 'PrCa_Sat',
    '%Mg Sat': 'PrMg_Sat',
    '%Na Sat': 'PrNa_Sat',
    'Mehlich P-III ppm P': 'Mehlich_P_III_ppm_P',
    '% Sand': 'Pr_Sand',
    '% Silt': 'Pr_Silt',
    '% Clay': 'Pr_Clay'
})
# Cache the cleaned tables (manually enable by flipping `if False`).
# NOTE(review): `e` is reused here as a path string; elsewhere in the notebook it
# serves as a throwaway loop variable — consider a more descriptive name.
e = './data/Processed/'
if False:
    phno.to_csv(e+'phno0.csv')
    meta.to_csv(e+'meta0.csv')
    soil.to_csv(e+'soil0.csv')
    wthr.to_csv(e+'wthr0.csv')
    cgmv.to_csv(e+'cgmv0.csv')
# How long is the growing season (planting DOY to harvest DOY), in days?
temp = meta.loc[:, ['Env', 'Date_Harvested', 'Date_Planted']].drop_duplicates()
temp['Duration'] = temp['Date_Harvested'] - temp['Date_Planted']
max_duration = np.nanmax(temp['Duration'])
max_duration
212.0
# The last harvest day is past the point I clipped the weather data (day 314)
np.nanmax(temp['Date_Harvested'])
348.0
# I can't get planting to harvest for all because that would go into the following year
# (latest planting + longest duration exceeds 365)
np.nanmax(temp['Date_Planted'])+np.nanmax(temp['Duration'])
382.0
# The longest growing period that I can get for all observations is:
# (weather is clipped at DOY 314, so the window is 314 minus the latest planting)
314-np.nanmax(temp['Date_Planted'])
144.0
# The longest period I could get before is
# (earliest planting DOY bounds how far back a pre-planting window can extend)
np.nanmin(temp['Date_Planted'])
60.0
# A full 2 months does not seem useful. If I do 36 then there will be a total of 180 observations
# (36 days before planting + 144 days after = a fixed 180-day window per Env)
144/4
36.0
# Build each Env's fixed 180-day weather window: 36 days before planting through
# 144 days after.
temp['Date_Planted'] = temp['Date_Planted'].astype(int)
temp['Start_Date'] = temp['Date_Planted'] - 36
temp['End_Date'] = temp['Date_Planted'] + 144
# every window must be exactly the same length (std of lengths == 0)
assert 0 == (np.std(temp['End_Date'] - temp['Start_Date']))
temp = temp.reset_index().drop(columns = 'index')
# cast everything to int for clean merging/labeling downstream
temp['Start_Date'] = temp['Start_Date'].astype(int)
temp['End_Date'] = temp['End_Date'].astype(int)
temp['Date_Planted'] = temp['Date_Planted'].astype(int)
temp['Date_Harvested'] = temp['Date_Harvested'].astype(int)
temp
| Env | Date_Harvested | Date_Planted | Duration | Start_Date | End_Date | |
|---|---|---|---|---|---|---|
| 0 | DEH1_2014 | 272 | 125 | 147.000000 | 89 | 269 |
| 1 | MOH2_2014 | 317 | 125 | 192.000000 | 89 | 269 |
| 2 | NEH3_2014 | 317 | 134 | 183.000000 | 98 | 278 |
| 3 | NEH3_2014 | 309 | 134 | 175.681915 | 98 | 278 |
| 4 | NEH3_2014 | 318 | 134 | 184.000000 | 98 | 278 |
| ... | ... | ... | ... | ... | ... | ... |
| 386 | NCH1_2021 | 263 | 111 | 152.000000 | 75 | 255 |
| 387 | NCH1_2021 | 264 | 112 | 152.000000 | 76 | 256 |
| 388 | NCH1_2021 | 267 | 112 | 155.000000 | 76 | 256 |
| 389 | NCH1_2020 | 244 | 114 | 130.000000 | 78 | 258 |
| 390 | NCH1_2020 | 245 | 114 | 131.000000 | 78 | 258 |
391 rows × 6 columns
# Flag weather days that fall inside each Env's 180-day window.
# NOTE(review): this loop scans all of `wthr` once per (Env, window) row —
# O(rows x windows); fine at this scale but a merge would be faster.
temp = temp.loc[:, ['Env','Date_Planted',
                    'Start_Date', 'End_Date']].drop_duplicates()
wthr['In_Window'] = False
for i in temp.index:
    Env, Start_Date, End_Date = temp.loc[i, ['Env', 'Start_Date', 'End_Date']]
    # half-open interval [Start_Date, End_Date) -> exactly 180 days
    mask = ( (wthr.Env == Env
    ) & (wthr.DOY >= Start_Date
    ) & (wthr.DOY < End_Date))
    wthr.loc[mask, 'In_Window'] = True
# Reduce weather to each Env's window and key every variable by *relative* day
# (Day 1 = first day of the Env's window) so columns align across Envs.
mask = wthr.In_Window
temp = wthr.loc[mask, ]
# 'min' instead of np.min: passing numpy callables to .agg is deprecated in pandas
min_days = temp.groupby('Env').agg(Min_DOY = ('DOY', 'min')).reset_index()
temp = temp.merge(min_days, how = "outer")
temp['Day'] = 1+(temp['DOY'] - temp['Min_DOY'])
temp = temp.drop(columns = ['Year', 'Date', 'In_Window', 'Min_DOY', 'DOY'])
# FIX: melt/label on the relative 'Day', not the absolute 'DOY'. The original
# named columns by DOY, so each Env only populated its own 180 DOY-specific
# columns and the pivoted wide table was ~99.6% missing (the sparsity the next
# cell complains about). With 'Day' every Env yields Day 1..180.
temp = temp.melt(id_vars = ['Env', 'Day'])
temp['variable'] = temp['variable'] +'_Day'+ temp['Day'].astype(int).astype(str)
temp = temp.drop(columns = ['Day'])
temp
| Env | variable | value | |
|---|---|---|---|
| 0 | TXH1_2014 | QV2M_Day24 | 2.69 |
| 1 | TXH1_2014 | QV2M_Day25 | 4.03 |
| 2 | TXH1_2014 | QV2M_Day26 | 6.16 |
| 3 | TXH1_2014 | QV2M_Day27 | 4.46 |
| 4 | TXH1_2014 | QV2M_Day28 | 2.69 |
| ... | ... | ... | ... |
| 738135 | NYH3_2022 | Day_Day283 | 176.00 |
| 738136 | NYH3_2022 | Day_Day284 | 177.00 |
| 738137 | NYH3_2022 | Day_Day285 | 178.00 |
| 738138 | NYH3_2022 | Day_Day286 | 179.00 |
| 738139 | NYH3_2022 | Day_Day287 | 180.00 |
738140 rows × 3 columns
# This here is causing problems.
# this is going to be SLOW
# temp.loc[(temp.Env.isin(['TXH1_2014', 'NYH3_2022'])), ].pivot(index = ['Env'], columns=['variable']).reset_index()
# pivot long -> wide: one row per Env, one column per per-day weather variable
temp_wide = temp.loc[:, ].pivot(index = ['Env'], columns=['variable']).reset_index()
test = summarize_col_missing(temp_wide)
# worst-case completion % across the wide columns
np.min(test.loc[:, 'Pr_Comp'])
test.sort_values('Pr_Comp')
| Col | N_miss | Pr_Comp | |
|---|---|---|---|
| 2751 | (value, QV2M_Day24) | 239 | 0.4 |
| 3410 | (value, T2MDEW_Day311) | 239 | 0.4 |
| 3409 | (value, T2MDEW_Day310) | 239 | 0.4 |
| 512 | (value, ALLSKY_SFC_SW_DNI_Day313) | 239 | 0.4 |
| 511 | (value, ALLSKY_SFC_SW_DNI_Day312) | 239 | 0.4 |
| ... | ... | ... | ... |
| 967 | (value, Day_Day196) | 0 | 100.0 |
| 968 | (value, Day_Day197) | 0 | 100.0 |
| 969 | (value, Day_Day198) | 0 | 100.0 |
| 962 | (value, Day_Day191) | 0 | 100.0 |
| 0 | (Env, ) | 0 | 100.0 |
4931 rows × 3 columns
# Cache the wide per-day weather table (manually enable by flipping `if False`).
if False:
    temp_wide.to_csv(e+'wthrWide0.csv')